In [None]:
get_ipython().run_cell_magic('capture', '', '!pip install db-dtypes\n!pip install keras\n!pip install tensorflow\n')

In [None]:
get_ipython().run_cell_magic('capture', '', "%logstop\n%logstart -t -r -q ipython_command_log.py global\n\n#- IRONHACKS RESEARCH TRACKING CODE\n#----------------------------------\n# The following code is used to help our research team understand how you \n# our notebook environment. We do not collect any personal information with\n# the following code, it is used to measure when and how often you work on\n# your submission files.\n\nimport os\nfrom datetime import datetime\nimport IPython.core.history as history\n\nha = history.HistoryAccessor()\nha_tail = ha.get_tail(1)\nha_cmd = next(ha_tail)\nsession_id = str(ha_cmd[0])\ncommand_id = str(ha_cmd[1])\ntimestamp = datetime.utcnow().isoformat()\nhistory_line = ','.join([session_id, command_id, timestamp]) + '\\n'\nlogfile = open(os.environ['HOME']+'/ipython_session_log.csv', 'a')\nlogfile.write(history_line)\nlogfile.close()\n")

In [None]:
get_ipython().run_cell_magic('capture', '', 'import pandas as pd\nimport numpy as np\nimport os\nfrom google.cloud import bigquery\nfrom google.oauth2 import service_account\nfrom google.cloud.bigquery import magics\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom sklearn.svm import SVR\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.metrics import mean_absolute_error, mean_squared_error\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.ensemble import RandomForestRegressor\nfrom sklearn.linear_model import LassoCV\nfrom sklearn.model_selection import RepeatedKFold\nfrom sklearn.linear_model import BayesianRidge\nfrom sklearn.linear_model import ElasticNetCV\nfrom keras.models import Sequential\nfrom keras.layers import Bidirectional, LSTM, Dropout, Dense\nfrom keras.models import load_model\nimport joblib\nfrom joblib import Parallel, delayed\nfrom scipy import stats\nfrom sklearn.ensemble import IsolationForest\n')

In [None]:
# Project that hosts the competition tables; the client below is bound to it.
BIGQUERY_PROJECT = 'ironhacks-data'
bigquery_client = bigquery.Client(project=BIGQUERY_PROJECT)

In [None]:
# Query text: every column and row of the unemployment_data table.
query_main = """
SELECT *
FROM `ironhacks-data.ironhacks_competition.unemployment_data`
"""

In [None]:
# Submit the query and pull the result into `unemployment_data` via to_dataframe().
# NOTE(review): `unemployment_data` is not referenced by any later cell visible
# in this notebook (the modelling below reads a CSV instead) - confirm it is still needed.
query_job = bigquery_client.query(query_main)
unemployment_data = query_job.to_dataframe()

In [None]:
# Query text: every column and row of the wage_data table.
query = """
SELECT *
FROM `ironhacks-data.ironhacks_competition.wage_data`
"""

In [None]:
# `query_job` is reused for each request; only the resulting frames are kept.
# NOTE(review): `wage_data` is likewise not referenced by any later visible cell.
query_job = bigquery_client.query(query)
wage_data = query_job.to_dataframe()

In [None]:
# Query text: every column and row of the prediction_list table.
query_pred = """
SELECT * 
FROM `ironhacks-data.ironhacks_competition.prediction_list`
"""

In [None]:
# `prediction_list` is read by get_predictions(), which uses its 'uu_id' and
# 'week_number' columns to build the submission file.
query_job = bigquery_client.query(query_pred)
prediction_list = query_job.to_dataframe()

In [None]:
def evaluate_regressor(prediction_dataframe):
    """Print MSE, MAE, RMSE and MAPE for a frame of actual vs. predicted values.

    Parameters
    ----------
    prediction_dataframe : pandas.DataFrame
        Must contain the columns 'Actual' and 'Predicted'.

    Returns
    -------
    None
        The metrics are only printed.

    Notes
    -----
    MAPE divides by the actual value, so rows whose 'Actual' is 0 are left out
    of the MAPE average (they would otherwise turn it into inf/nan). When rows
    are left out, the printed MAPE line says how many. If every actual value
    is 0 the MAPE is reported as nan.
    """
    actual = prediction_dataframe['Actual']
    predicted = prediction_dataframe['Predicted']

    # MSE is computed once and reused for RMSE.
    mse = mean_squared_error(actual, predicted)
    print('MSE:', mse)
    print('MAE:', mean_absolute_error(actual, predicted))
    print('RMSE:', np.sqrt(mse))

    nonzero = actual != 0
    skipped = int((~nonzero).sum())
    if nonzero.any():
        relative_error = (actual[nonzero] - predicted[nonzero]) / actual[nonzero]
        mape = np.mean(np.abs(relative_error)) * 100
    else:
        mape = np.nan

    if skipped:
        print('MAPE:', mape, '(excluded', skipped, 'rows with Actual == 0)')
    else:
        print('MAPE:', mape)

In [None]:
def get_predictions(regressor, model_type, name, week, feature_values=None):
    """Predict total claims for every row of `prediction_list` and save them as CSV.

    Relies on notebook-level objects: `prediction_list` (needs the columns
    'uu_id' and 'week_number'), the label encoder `LE`, and the already fitted
    scalers `SC_other` (used for 'DL') and `RB_other` (used for 'ML').

    Parameters
    ----------
    regressor : object with a ``predict`` method
        Model used to produce the predictions.
    model_type : {'DL', 'ML'}
        'DL' feeds the model a 3-D array of shape (rows, n_features, 1);
        'ML' feeds a 2-D array of shape (rows, n_features).
    name : str
        File stem; the output is written to
        lost+found/submission_files/<name>.csv.
    week : number
        Week number used as the second feature of every row.
    feature_values : sequence of numbers, optional
        Values for the scaled feature columns, shared by every row. They are
        passed through the scaler before prediction. Defaults to the vector
        that was previously hard-coded in this function.

    Returns
    -------
    pandas.DataFrame
        Copy of `prediction_list` reduced to uu_id, total_claims, week_number.

    Raises
    ------
    ValueError
        If `model_type` is neither 'DL' nor 'ML'.
    """
    if model_type == 'DL':
        scaler = SC_other
    elif model_type == 'ML':
        scaler = RB_other
    else:
        raise ValueError("model_type must be 'DL' or 'ML', got %r" % (model_type,))

    if feature_values is None:
        # NOTE(review): these defaults look like values that are already
        # scaled, yet they are passed through the scaler again below -
        # confirm this is intended.
        feature_values = [-0.04, -0.140, 0.328, -0.671, -0.420, -0.432, -0.0013,
                          -0.0023, -0.347, -0.0004, 3.211, -0.532, -0.329]

    # NOTE(review): fit_transform re-fits the shared encoder on prediction_list.
    # The codes only match those produced in preprocess() if both frames hold
    # the same set of uu_id values - confirm.
    uu_id_transform = LE.fit_transform(prediction_list['uu_id'])
    n_rows = len(uu_id_transform)
    predict_arr = np.array(scaler.transform([list(feature_values)]))

    if n_rows == 0:
        predictions = np.empty(0)
    else:
        # One row per uu_id: [encoded uu_id, week, scaled features...].
        features = np.column_stack([
            uu_id_transform,
            np.full(n_rows, week),
            np.repeat(predict_arr, n_rows, axis=0),
        ])
        if model_type == 'DL':
            # The DL model is fed one trailing axis of length 1 per feature.
            features = features.reshape(n_rows, features.shape[1], 1)
        # A single batched predict call replaces the former per-row loop, and
        # the result is flattened by its own size instead of a fixed 525.
        predictions = np.asarray(regressor.predict(features)).reshape(-1)

    prediction_sub = prediction_list.copy()
    prediction_sub['total_claims'] = predictions
    prediction_sub = prediction_sub[['uu_id', 'total_claims', 'week_number']]

    out_dir = 'lost+found/submission_files'
    os.makedirs(out_dir, exist_ok=True)
    prediction_sub.to_csv(out_dir + '/' + name + '.csv', index=False)
    return prediction_sub

In [None]:
def get_pred_frame(test_frame, prediction_array):
    """Pair actual targets with flattened predictions in a two-column frame.

    `test_frame` becomes the 'Actual' column and `prediction_array.flatten()`
    becomes the 'Predicted' column.
    """
    flat_predictions = prediction_array.flatten()
    columns = {'Actual': test_frame, 'Predicted': flat_predictions}
    return pd.DataFrame(columns)

In [None]:
def clearOutlier_IQR(data):
    """Drop rows that fall outside the 1.5 * IQR fences in any column.

    Prints the shape of the filtered frame and returns it.
    """
    lower_quartile = data.quantile(0.25)
    upper_quartile = data.quantile(0.75)
    spread = upper_quartile - lower_quartile

    below_fence = data < (lower_quartile - 1.5 * spread)
    above_fence = data > (upper_quartile + 1.5 * spread)
    # A row is an outlier as soon as one of its columns breaks a fence.
    outlier_rows = (below_fence | above_fence).any(axis=1)

    no_outliers = data[~outlier_rows]
    print(no_outliers.shape)
    return no_outliers

In [None]:
# Z-score based outlier removal (meant for roughly gaussian columns)
def clearOutlier_ZScore(data, threshold):
    """Keep only the rows whose absolute z-score is below `threshold` in every column."""
    abs_scores = np.abs(stats.zscore(data))
    within_limit = abs_scores < threshold
    keep_rows = within_limit.all(axis=1)
    return data[keep_rows]

In [None]:
# outlier detection - automatic
def IsoForest_anomaly(data):
    """Remove rows that an IsolationForest labels as anomalies.

    The forest (fixed random_state, so repeated runs give the same labels) is
    fitted on every column of `data`. A seaborn pairplot of the columns in
    `col_list`, coloured by the anomaly label, is drawn as a side effect, and
    the number of removed rows is printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric frame that contains at least the columns listed in `col_list`.

    Returns
    -------
    pandas.DataFrame
        New frame holding only the rows labelled 1 (inliers), without the
        helper columns 'anomaly_scores' and 'anomaly'. `data` is not modified.
    """
    IFO = IsolationForest(random_state=69)
    col_list = ['week_number', 'total_claims', 'edu_8th_or_less',
                'edu_grades_9_11', 'edu_hs_grad_equiv', 'edu_post_hs', 'gender_female',
                'gender_male', 'race_amerindian', 'race_asian', 'race_black',
                'race_hawaiiannative', 'race_other', 'race_white', 'average_wage']
    NO_df = data.copy()
    IFO.fit(data)
    NO_df['anomaly_scores'] = IFO.decision_function(data)
    NO_df['anomaly'] = IFO.predict(data)

    is_inlier = NO_df['anomaly'] == 1
    print('Removed ', NO_df[NO_df['anomaly'] == -1].shape[0], 'datapoints')

    palette = ['#ff7f0e','#1f77b4']
    sns.pairplot(NO_df, vars = col_list, hue='anomaly', palette=palette)

    # drop() without inplace returns a new frame, so the filtered slice is
    # never mutated (the former inplace drop raised SettingWithCopyWarning).
    no_outlier = NO_df.loc[is_inlier].drop(columns=['anomaly_scores', 'anomaly'])
    return no_outlier

In [None]:
def preprocess(data, scaling):
    """Drop unused columns, label-encode uu_id and scale the feature columns.

    Works on the frame passed in as `data` (previously the global `ingest`
    was used and the argument was ignored). `data` itself is not modified.

    Side effects: re-fits the notebook-level encoder `LE`, and re-fits
    `RB_other` or `SC_other` depending on `scaling`; get_predictions() later
    reuses those fitted objects.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'uu_id', every column in `to_drop` and every column in
        `to_scale`.
    scaling : str
        'Robust' scales with `RB_other`, 'Standard' scales with `SC_other`.
        Any other value leaves the feature columns unscaled.

    Returns
    -------
    pandas.DataFrame
        New frame without the `to_drop` columns, with 'uu_id' replaced by its
        integer codes and the `to_scale` columns scaled as requested.
    """
    to_drop = ['timeperiod','tract','top_category_employer1','top_category_employer2',
               'top_category_employer3','tract_name','countyfips', 'edu_unknown', 'gender_na',
               'race_noanswer']
    to_scale = ['edu_8th_or_less', 'edu_grades_9_11', 'edu_hs_grad_equiv', 'edu_post_hs',
                'gender_female', 'gender_male', 'race_amerindian', 'race_asian', 'race_black',
                'race_hawaiiannative', 'race_other', 'race_white', 'average_wage']

    # drop() returns a new frame, so the caller's frame stays untouched.
    no_outlierDF = data.drop(columns=to_drop)
    no_outlierDF['uu_id'] = LE.fit_transform(no_outlierDF['uu_id'])
    if scaling == 'Robust':
        no_outlierDF[to_scale] = RB_other.fit_transform(no_outlierDF[to_scale])
    elif scaling == 'Standard':
        no_outlierDF[to_scale] = SC_other.fit_transform(no_outlierDF[to_scale])
    return no_outlierDF

In [None]:
# updated_ingest = pd.concat([merged_ingest, combined_ingest])
# The modelling data is read from a CSV stored next to the submission files.
# NOTE(review): no visible cell creates complete_ingest.csv - the file must
# already exist for this cell to run.
ingest = pd.read_csv('lost+found/submission_files/complete_ingest.csv')

In [None]:
# quick preprocess to keep uu_id and scale values
# These three objects are shared module-level state: preprocess() fits them
# and get_predictions() reuses them. This cell must run before either is called.
# NOTE(review): this import sits mid-notebook rather than in the import cell at the top.
from sklearn.preprocessing import LabelEncoder, RobustScaler, StandardScaler
LE = LabelEncoder()
RB_other = RobustScaler()
SC_other = StandardScaler()
# RB_claims = RobustScaler()

In [None]:
# Data for the random-forest model: robust scaling, then isolation-forest
# filtering (which also prints the removed-row count and draws a pairplot).
ML_data = preprocess(ingest, 'Robust')
ingest_clean = IsoForest_anomaly(ML_data)

In [None]:
# set target and independent variables
# Y is the claims count; X is the encoded uu_id, the week and the 13 scaled features.
Y = ingest_clean['total_claims']
X = ingest_clean[['uu_id', 'week_number', 'edu_8th_or_less',
                 'edu_grades_9_11', 'edu_hs_grad_equiv', 'edu_post_hs', 'gender_female',
                 'gender_male', 'race_amerindian', 'race_asian', 'race_black',
                 'race_hawaiiannative', 'race_other', 'race_white', 'average_wage']]

In [None]:
# hold out 20% of the rows for evaluation; random_state is fixed so the split is repeatable
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.20, random_state=69)

In [None]:
# load model - next time, I'll need to learn to use PMML
# The model is loaded from disk, not trained here; X_train / Y_train are not
# used by any visible cell.
# NOTE(review): joblib.load unpickles the file - only load files from a trusted source.
RFR_Regressor = joblib.load('RF_v1-7.pkl')

In [None]:
# Predict on the raw values of the held-out rows; reshape to a single column.
Y_pred_RFR = RFR_Regressor.predict(X_test.values).reshape(-1,1)

In [None]:
# evaluate: prints MSE, MAE, RMSE and MAPE for the held-out rows
evaluate_regressor(get_pred_frame(Y_test,Y_pred_RFR))

In [None]:
# Write lost+found/submission_files/submission_prediction_output_RFR.csv for week 44.
get_predictions(RFR_Regressor, 'ML', 'submission_prediction_output_RFR', 44)

In [None]:
# Actual vs. predicted claims for the held-out rows, in test-set order.
plt.figure(figsize=(15,10))
plt.plot(Y_test.values, color = 'red', label = 'Actual values')
plt.plot(Y_pred_RFR, color='blue', label='Predicted values')
plt.title('Model Prediction Visual')
plt.legend()
plt.show()

In [None]:
# this needs different feature engineering, so I'm starting from scratch
# Same raw frame as before, but scaled with the standard scaler (SC_other).
# Note that preprocess() also re-fits the shared label encoder LE.
DL_data = preprocess(ingest, 'Standard')

In [None]:
# Isolation-forest filtering of the standard-scaled frame.
DL_ingest = IsoForest_anomaly(DL_data)

In [None]:
# split set
# Same target, feature columns, test share and seed as the random-forest split.
DL_Y = DL_ingest['total_claims']
DL_X = DL_ingest[['uu_id', 'week_number', 'edu_8th_or_less', 'edu_grades_9_11', 'edu_hs_grad_equiv', 'edu_post_hs', 'gender_female', 'gender_male',
               'race_amerindian', 'race_asian', 'race_black', 'race_hawaiiannative', 'race_other', 'race_white', 'average_wage']]
DL_XTrain, DL_XTest, DL_YTrain, DL_YTest = train_test_split(DL_X, DL_Y, test_size=0.20, random_state=69)

In [None]:
# change to np vectors
DL_XTrain = DL_XTrain.to_numpy()
DL_XTest = DL_XTest.to_numpy()

In [None]:
# reshape to 3-D (rows, features, 1): a trailing axis of length 1 is added per feature
# NOTE(review): presumably the input shape the loaded model expects - confirm.
DL_XTrain = np.reshape(DL_XTrain, (DL_XTrain.shape[0], DL_XTrain.shape[1], 1))

In [None]:
# convert X and Y train to float because input dtype accepts floats only
# NOTE(review): no visible cell trains a model (it is loaded from disk below),
# so these training arrays are not used afterwards in this notebook.
DL_YTrain = DL_YTrain.astype(float)
DL_XTrain = DL_XTrain.astype(float)

In [None]:
# load model: the previously saved Keras model is read from an .h5 file
StackLSTM_Regressor = load_model('BiDLSTM_v1-05.h5')

In [None]:
# summary for viewers
StackLSTM_Regressor.summary()

In [None]:
# float cast
DL_XTest = DL_XTest.astype(float)
# make predictions
# DL_XTest is overwritten with its 3-D version, so this cell is not safe to
# run twice without re-running the split and to_numpy cells first.
DL_XTest = np.reshape(DL_XTest, (DL_XTest.shape[0], DL_XTest.shape[1],1))
predictions = StackLSTM_Regressor.predict(DL_XTest)

In [None]:
# Show actual vs. predicted values side by side.
get_pred_frame(DL_YTest, predictions)

In [None]:
# Print MSE, MAE, RMSE and MAPE for the held-out rows.
evaluate_regressor(get_pred_frame(DL_YTest, predictions))

In [None]:
# Actual vs. predicted claims for the held-out rows, in test-set order.
plt.figure(figsize=(15,10))
plt.plot(DL_YTest.values, color = 'red', label = 'Actual values')
plt.plot(predictions, color='blue', label='Predicted values')
plt.title('Model Prediction Visual')
plt.legend()
plt.show()

In [None]:
# Write lost+found/submission_files/submission_prediction_output.csv for week 44.
get_predictions(StackLSTM_Regressor, 'DL', 'submission_prediction_output', 44)

In [None]:
def get_predictions(regressor, model_type, name, week, feature_values=None):
    """Predict total claims for every row of `prediction_list` and save them as CSV.

    A function of the same name is defined earlier in the notebook; once this
    cell has run, this definition is the one in effect. It differs from the
    earlier one in the default feature vector, which here holds unscaled values.

    Relies on notebook-level objects: `prediction_list` (needs the columns
    'uu_id' and 'week_number'), the label encoder `LE`, and the already fitted
    scalers `SC_other` (used for 'DL') and `RB_other` (used for 'ML').

    Parameters
    ----------
    regressor : object with a ``predict`` method
        Model used to produce the predictions.
    model_type : {'DL', 'ML'}
        'DL' feeds the model a 3-D array of shape (rows, n_features, 1);
        'ML' feeds a 2-D array of shape (rows, n_features).
    name : str
        File stem; the output is written to
        lost+found/submission_files/<name>.csv.
    week : number
        Week number used as the second feature of every row.
    feature_values : sequence of numbers, optional
        Values for the scaled feature columns, shared by every row. They are
        passed through the scaler before prediction. Defaults to the vector
        that was previously hard-coded in this function.

    Returns
    -------
    pandas.DataFrame
        Copy of `prediction_list` reduced to uu_id, total_claims, week_number.

    Raises
    ------
    ValueError
        If `model_type` is neither 'DL' nor 'ML'.
    """
    if model_type == 'DL':
        scaler = SC_other
    elif model_type == 'ML':
        scaler = RB_other
    else:
        raise ValueError("model_type must be 'DL' or 'ML', got %r" % (model_type,))

    if feature_values is None:
        # One fixed feature profile is applied to every uu_id.
        # NOTE(review): the origin of these numbers is not visible here - confirm.
        feature_values = [0, 1, 6, 4, 4, 5, 0, 1, 2, 0, 4, 1, 4200.0]

    # NOTE(review): fit_transform re-fits the shared encoder on prediction_list.
    # The codes only match those produced in preprocess() if both frames hold
    # the same set of uu_id values - confirm.
    uu_id_transform = LE.fit_transform(prediction_list['uu_id'])
    n_rows = len(uu_id_transform)
    predict_arr = np.array(scaler.transform([list(feature_values)]))

    if n_rows == 0:
        predictions = np.empty(0)
    else:
        # One row per uu_id: [encoded uu_id, week, scaled features...].
        features = np.column_stack([
            uu_id_transform,
            np.full(n_rows, week),
            np.repeat(predict_arr, n_rows, axis=0),
        ])
        if model_type == 'DL':
            # The DL model is fed one trailing axis of length 1 per feature.
            features = features.reshape(n_rows, features.shape[1], 1)
        # A single batched predict call replaces the former per-row loop, and
        # the result is flattened by its own size instead of a fixed 525.
        predictions = np.asarray(regressor.predict(features)).reshape(-1)

    prediction_sub = prediction_list.copy()
    prediction_sub['total_claims'] = predictions
    prediction_sub = prediction_sub[['uu_id', 'total_claims', 'week_number']]

    out_dir = 'lost+found/submission_files'
    os.makedirs(out_dir, exist_ok=True)
    prediction_sub.to_csv(out_dir + '/' + name + '.csv', index=False)
    return prediction_sub

In [None]:
# call func
# This call uses the get_predictions redefined in the cell above and writes to
# the same file name as the earlier random-forest call, so
# submission_prediction_output_RFR.csv is overwritten.
get_predictions(RFR_Regressor, 'ML', 'submission_prediction_output_RFR', 44)

In [None]:
# Same held-out comparison frame as shown earlier; `predictions` is unchanged.
get_pred_frame(DL_YTest, predictions)

In [None]:
# Same held-out metrics as printed earlier.
evaluate_regressor(get_pred_frame(DL_YTest, predictions))

In [None]:
# Uses the redefined get_predictions; overwrites submission_prediction_output.csv.
get_predictions(StackLSTM_Regressor, 'DL', 'submission_prediction_output', 44)