In [1]:
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
from sklearn import preprocessing
# from sklearn.utils import shuffle
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, MultiTaskElasticNet, MultiTaskLasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import confusion_matrix, mean_squared_error, log_loss, accuracy_score
import time
# from sklearn.manifold import TSNE
import joblib


# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [2]:
pd.__version__

'0.24.2'

In [3]:
Xtrain = np.load('data/Xtrain_fake.npy')
Xdev = np.load('data/Xdev_fake.npy')
Xtest = np.load('data/Xtest_fake.npy')

In [4]:
ytrain = np.load('data/ytrain_fake.npy')
ydev = np.load('data/ydev_fake.npy')
ytest = np.load('data/ytest_fake.npy')

In [9]:
def hyp_tuning(model_name, Xtrain, ytrain, Xdev, ydev):
    best_results = {}
    best_dev_mse = 1000
    
    for hyp in range(1, 10):
        
        print('='*20)
        print('starting to compute hyp={}'.format(hyp))
        
        if model_name == 'KNN':
            model = MultiOutputRegressor(KNeighborsRegressor(n_neighbors=hyp))
        elif model_name == 'RF':
            model = MultiOutputRegressor(RandomForestRegressor(n_estimators=hyp,
                                                          #max_depth=max_depth,
                                                          random_state=0))
        else:
            raise ValueError('Unknown model_name')                               

        model.fit(Xtrain, ytrain)
        
        pred_train = model.predict(Xtrain)
        mse_train = mean_squared_error(ytrain, pred_train)

        pred_dev = model.predict(Xdev)
        mse_dev = mean_squared_error(ydev, pred_dev)
        
        if mse_dev < best_dev_mse:
            best_dev_mse = mse_dev
            print('Up to now the best dev_mse is {}'.format(best_dev_mse))
            print('and the associated training_mse is {}'.format(mse_train))
            best_results['Dev_mse'] = best_dev_mse
            best_results['Train_mse'] = mse_train
            
            best_model = model
            best_hyp = hyp
            
    save_path = 'results/baseline_{}_with_{}.joblib'.format(model_name, best_hyp)
    joblib.dump(best_model, save_path)
    
    return best_results

In [10]:
knn_best_results = hyp_tuning('KNN', Xtrain, ytrain, Xdev, ydev)

starting to compute hyp=1
Up to now the best dev_mse is 0.10234985476743712
and the associated training_mse is 0.0
starting to compute hyp=2
Up to now the best dev_mse is 0.08465692519429463
and the associated training_mse is 0.42284243542451544
starting to compute hyp=3
starting to compute hyp=4
starting to compute hyp=5
Up to now the best dev_mse is 0.08393667070164451
and the associated training_mse is 1.0140709275467523
starting to compute hyp=6
Up to now the best dev_mse is 0.07278864640279205
and the associated training_mse is 1.0936818186322534
starting to compute hyp=7
Up to now the best dev_mse is 0.06818806633878849
and the associated training_mse is 1.159180766788052
starting to compute hyp=8
starting to compute hyp=9


In [12]:
knn_best_results

{'Dev_mse': 0.06818806633878849, 'Train_mse': 1.159180766788052}

In [13]:
knn_best_model = joblib.load('results/baseline_KNN_with_7.joblib')
pred_test = knn_best_model.predict(Xtest)
mse_test = mean_squared_error(ytest, pred_test)

print('The test_mse of the best KNN model is {}'.format(mse_test))
knn_best_results['Test_mse'] = mse_test

The test_mse of the best KNN model is 0.4914082589694498


In [15]:
rf_best_results = hyp_tuning('RF', Xtrain, ytrain, Xdev, ydev)

starting to compute hyp=1
Up to now the best dev_mse is 12.638849038595891
and the associated training_mse is 0.40183168114755025
starting to compute hyp=2
Up to now the best dev_mse is 3.2075993008254264
and the associated training_mse is 0.4701808003788381
starting to compute hyp=3
starting to compute hyp=4
Up to now the best dev_mse is 3.155231662920647
and the associated training_mse is 0.1241974961462712
starting to compute hyp=5
starting to compute hyp=6
starting to compute hyp=7
starting to compute hyp=8
starting to compute hyp=9


In [16]:
rf_best_results

{'Dev_mse': 3.155231662920647, 'Train_mse': 0.1241974961462712}

In [17]:
rf_best_model = joblib.load('results/baseline_RF_with_4.joblib')
pred_test = rf_best_model.predict(Xtest)
mse_test = mean_squared_error(ytest, pred_test)

print('The test_mse of the best RF model is {}'.format(mse_test))
rf_best_results['Test_mse'] = mse_test

The test_mse of the best RF model is 12.011205353294924


## The Linear Regression Model

In [19]:
regr_multilr = MultiOutputRegressor(LinearRegression())
regr_multilr.fit(Xtrain, ytrain)
pred_train = regr_multilr.predict(Xtrain)
mse_train = mean_squared_error(ytrain, pred_train)

pred_dev = regr_multilr.predict(Xdev)
mse_dev = mean_squared_error(ydev, pred_dev)

pred_test = regr_multilr.predict(Xtest)
mse_test = mean_squared_error(ytest, pred_test)

lr_results = {}
lr_results['Dev_mse'] = mse_dev
lr_results['Train_mse'] = mse_train
lr_results['Test_mse'] = mse_test

joblib.dump(regr_multilr, 'results/baseline_LR.joblib')
print(lr_results)

{'Test_mse': 32610.618029070913, 'Dev_mse': 4.346446210908264, 'Train_mse': 9.228302212011216e-30}


# Below is the old version, don't use anymore

## Preparing data

In [6]:
cols_name = pd.Series(data=df.columns)
ar_04_beg_col_index = cols_name[cols_name == 'AR_exchange_04'].index[0]
ar_06_beg_col_index = cols_name[cols_name == 'AR_exchange_06'].index[0]
ar_06_end_col_index = cols_name[cols_name == 'AR_eslt_06'].index[0]

wl_AR_cols = cols_name[ar_04_beg_col_index : ar_06_end_col_index+1].tolist()

In [7]:
output_cols = wl_AR_cols[-12:]

In [8]:
df_train = df.loc[df['Train'] == 1]
df_test = df.loc[df['Train'] == 0]

In [9]:
df_train.drop(columns='Train', inplace=True)
df_test.drop(columns='Train', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [10]:
ytrain = df_train.loc[:, output_cols]

Xtrain = df_train.drop(columns=output_cols) # use profile + previous usage
# Xtrain = df_train.loc[:, wl_AR_cols[:-12]] # use previous usage only

In [11]:
dev_size = int(Xtrain.shape[0] * 0.2)

Xtrain = Xtrain.to_numpy()
ytrain = ytrain.to_numpy()

Xdev = Xtrain[-dev_size:]
ydev = ytrain[-dev_size:]

Xtrain = Xtrain[:-dev_size]
ytrain = ytrain[:-dev_size]


In [12]:
ytest = df_test.loc[:, output_cols]

Xtest = df_test.drop(columns=output_cols) 
# Xtest = df_test.loc[:, wl_AR_cols[:-12]] 

In [13]:
Xtest = Xtest.to_numpy()
ytest = ytest.to_numpy()

In [14]:
scaler = StandardScaler()

Xtrain = scaler.fit_transform(Xtrain)
Xdev = scaler.transform(Xdev)
Xtest = scaler.transform(Xtest)

In [46]:
Xtest.shape

(5208, 1075)

## Baseline models

In [28]:
knn_best = hyp_tuning('KNN', Xtrain, ytrain, Xdev, ydev)

starting to compute hyp=1
0.291397589932976
starting to compute hyp=2
0.25400886832455183
starting to compute hyp=3
starting to compute hyp=4
starting to compute hyp=5
starting to compute hyp=6
starting to compute hyp=7
starting to compute hyp=8
starting to compute hyp=9


In [None]:
rf_best = hyp_tuning('RF', Xtrain, ytrain, Xdev, ydev)

In [30]:
knn_best

{1: {'mse_dev': 0.291397589932976, 'mse_train': 0.0},
 2: {'mse_dev': 0.25400886832455183, 'mse_train': 0.03877537139647624}}

In [41]:
best_knn_hyp = list(knn_best.keys())[-1]
best_knn_hyp


2

In [42]:
best_knn_model_path = 'results/baseline_KNN_with_{}'.format(best_knn_hyp)

best_knn_model = joblib.load(best_knn_model_path)

In [48]:
pred_test = best_knn_model.predict(Xtest[:100])
print(mean_squared_error(ytest[:100], pred_test))

0.5502280806708065


In [20]:
regr_multilr = MultiOutputRegressor(LinearRegression())
regr_multilr.fit(Xtrain, ytrain)


MultiOutputRegressor(estimator=LinearRegression(copy_X=True, fit_intercept=True,
                                                n_jobs=None, normalize=False),
                     n_jobs=None)

In [24]:
score = regr_multilr.score(Xdev,ydev)
print(score)

-5.423296019726179e+22


In [21]:
file_path = 'results/baseline_models_fake.pk1'
joblib.dump(regr_multilr, file_path)

['results/baseline_models_fake.pk1']

In [25]:
loaded_model = joblib.load(file_path)
score_ = loaded_model.score(Xdev, ydev)

In [26]:
score_

-5.423296019726179e+22

In [19]:
regr_multilr = MultiOutputRegressor(LinearRegression())
regr_multilr.fit(Xtrain, ytrain)

# regr_lr = LinearRegression()
# regr_lr.fit(Xtrain, ytrain)

# Predict on dev data
pred_dev_multilr = regr_multilr.predict(Xdev)
# pred_dev_lr = regr_lr.predict(Xdev)

# Predict on training data
pred_train_multilr = regr_multilr.predict(Xtrain)
# pred_train_lr = regr_lr.predict(Xtrain)

# Predict on test data
pred_test_multilr = regr_multilr.predict(Xtest)
# pred_test_lr = regr_lr.predict(Xtest)

print(mean_squared_error(ytrain, pred_train_multilr))
# print(mean_squared_error(ytrain, pred_train_lr))
print('='*20)

print(mean_squared_error(ydev, pred_dev_multilr))
# print(mean_squared_error(ydev, pred_dev_lr))
print('='*20)


print(mean_squared_error(ytest, pred_test_multilr))
# print(mean_squared_error(ytest, pred_test_lr))


0.034366045100794336
1.3860198866646668e+22
4.44830001519688e+23


In [85]:
HYP = 3

regr_multirf = MultiOutputRegressor(RandomForestRegressor(n_estimators=HYP,
                                                          #max_depth=max_depth,
                                                          random_state=0))
regr_multirf.fit(Xtrain, ytrain)

regr_rf = RandomForestRegressor(n_estimators=HYP, 
                                #max_depth=max_depth,
                                random_state=2)
regr_rf.fit(Xtrain, ytrain)

# Predict on dev data
pred_dev_multirf = regr_multirf.predict(Xdev)
pred_dev_rf = regr_rf.predict(Xdev)

# Predict on training data
pred_train_multirf = regr_multirf.predict(Xtrain)
pred_train_rf = regr_rf.predict(Xtrain)

# Predict on test data
pred_test_multirf = regr_multirf.predict(Xtest)
pred_test_rf = regr_rf.predict(Xtest)

print(mean_squared_error(ytrain, pred_train_multirf))
print(mean_squared_error(ytrain, pred_train_rf))
print('='*20)

print(mean_squared_error(ydev, pred_dev_multirf))
print(mean_squared_error(ydev, pred_dev_rf))
print('='*20)

print(mean_squared_error(ytest, pred_test_multirf))
print(mean_squared_error(ytest, pred_test_rf))

0.007517609340720407
0.01355654344966159
0.2427408399047468
0.30599999673645834
1.247933321003166
1.5341692445403476


In [16]:
HYP = 5

regr_multiknn = MultiOutputRegressor(KNeighborsRegressor(n_neighbors=HYP))
# regr_multiknn.fit(Xtrain, ytrain)

regr_knn = KNeighborsRegressor(n_neighbors=HYP)
# regr_knn.fit(Xtrain, ytrain)

# Predict on new data
pred_dev_multiknn = regr_multiknn.predict(Xdev)
# pred_dev_knn = regr_knn.predict(Xdev)

# Predict on training data
pred_train_multiknn = regr_multiknn.predict(Xtrain)
# pred_train_knn = regr_knn.predict(Xtrain)

# Predict on test data
pred_test_multiknn = regr_multiknn.predict(Xtest)
# pred_test_knn = regr_knn.predict(Xtest)

print(mean_squared_error(ytrain, pred_train_multiknn))
# print(mean_squared_error(ytrain, pred_train_knn))
print('='*20)

print(mean_squared_error(ydev, pred_dev_multiknn))
# print(mean_squared_error(ydev, pred_dev_knn))
print('='*20)

print(mean_squared_error(ytest, pred_test_multiknn))
# print(mean_squared_error(ytest, pred_test_knn))

NotFittedError: This MultiOutputRegressor instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

In [None]:
model = regr_multilr

Xtest_show = Xtest[:3]
ytest_show = ytest[:3]

predictions = model.predict(Xtest_show)
print(mean_squared_error(ytest_show, predictions))


In [48]:
for i in range(len(ytest_show)):
    print('This is for test example {}'.format(i))
    
    n = Xtest_show.shape[1]
    predictions = model.predict(Xtest_show[i].reshape((1, n)))
    
    for j in range(len(ytest_show[i])):
        print('{}_true: {}'.format(output_cols[j], ytest_show[i][j]))
        print('{}_pred: {}'.format(output_cols[j], predictions[0][j]))
        print('='*50)
    
    print('*'*20)

This is for test example 0
AR_exchange_06_true: 2.35
AR_exchange_06_pred: 2.2022577871976274
AR_sharepoint_06_true: 1.0
AR_sharepoint_06_pred: 1.0393913915702737
AR_skype_06_true: 1.0
AR_skype_06_pred: -0.09751276050036277
AR_teams_06_true: 1.0
AR_teams_06_pred: 1.5933571860765217
AR_od4b_06_true: 1.0
AR_od4b_06_pred: 1.1086326724223692
AR_onenote_06_true: 0.0
AR_onenote_06_pred: 0.5920305879010814
AR_word_06_true: 1.0
AR_word_06_pred: 1.0320620400386928
AR_excel_06_true: 1.0
AR_excel_06_pred: 0.9986296307489022
AR_powerpoint_06_true: 0.0
AR_powerpoint_06_pred: 0.9507768976657778
AR_outlook_06_true: 1.0
AR_outlook_06_pred: 0.9413428254711472
AR_officelient_06_true: 1.0
AR_officelient_06_pred: 0.9975101210396461
AR_eslt_06_true: 2.375
AR_eslt_06_pred: 2.2126891323681024
********************
This is for test example 1
AR_exchange_06_true: 0.9545454545454546
AR_exchange_06_pred: 0.9329479104161074
AR_sharepoint_06_true: 0.0
AR_sharepoint_06_pred: 0.005770938053517624
AR_skype_06_true: 0.0

In [20]:
np.any(np.isnan(Xtrain_exo))

False

In [21]:
sc = StandardScaler()  
Xtrain_exo = sc.fit_transform(Xtrain_exo)  
# Xtest = sc.transform(Xtest)
Xdev_exo = sc.transform(Xdev_exo)

In [40]:
model = KNeighborsRegressor(n_neighbors=7)
# model = LinearRegression()
# model = RandomForestRegressor(n_estimators=5, random_state=0)

model.fit(Xtrain_exo, ytrain_exo)

pred_train = model.predict(Xtrain_exo)
print(mean_squared_error(ytrain_exo, pred_train))

pred_dev = model.predict(Xdev_exo)
print(mean_squared_error(ydev_exo, pred_dev))

0.33455920368214354
0.27741462628985025


In [43]:
# model = KNeighborsRegressor(n_neighbors=7)
# model = LinearRegression()
model = RandomForestRegressor(n_estimators=5, random_state=0)

model.fit(Xtrain_exo, ytrain_exo)

pred_train = model.predict(Xtrain_exo)
print(mean_squared_error(ytrain_exo, pred_train))

pred_dev = model.predict(Xdev_exo)
print(mean_squared_error(ydev_exo, pred_dev))


0.03357450083696344
0.008033447165090218


In [42]:
# model = KNeighborsRegressor(n_neighbors=7)
model = LinearRegression()
# model = RandomForestRegressor(n_estimators=5, random_state=0)

model.fit(Xtrain_exo, ytrain_exo)

pred_train = model.predict(Xtrain_exo)
print(mean_squared_error(ytrain_exo, pred_train))

pred_dev = model.predict(Xdev_exo)
print(mean_squared_error(ydev_exo, pred_dev))

0.004256127313895168
2.7955966633382052e+20


In [35]:
def hyp_tuning(model_name, Xtrain, ytrain, Xdev, ydev):
    mse_dev_dict = {}
    best_dev_mse = 1000
    for hyp in range(2, 30):
        
        print('='*20)
        print('starting to compute hyp={}'.format(hyp))
        
        if model_name == 'KNN':
            model = KNeighborsRegressor(n_neighbors=hyp)
        elif model_name == 'RF':
            model = RandomForestRegressor(n_estimators=hyp, random_state=0)
        else:
            raise ValueError('Unknown model_name')                               

        model.fit(Xtrain, ytrain)
        
        pred_train = model.predict(Xtrain_exo)
        mse_train = mean_squared_error(ytrain_exo, pred_train)

        pred_dev = model.predict(Xdev_exo)
        mse_dev = mean_squared_error(ydev_exo, pred_dev)
        
        if mse_dev < best_dev_mse:
            best_dev_mse = mse_dev
            print(hyp, best_dev_mse)
            mse_dev_dict[hyp] = best_dev_mse
            
            
    
    return mse_dev_dict
        
    

In [36]:
mse_KNN = hyp_tuning('KNN', Xtrain_exo, ytrain_exo, Xdev_exo, ydev_exo)

starting to compute hyp=2
2 0.303295558483608
starting to compute hyp=3
3 0.3020542355088155
starting to compute hyp=4
starting to compute hyp=5
5 0.3013890671612971
starting to compute hyp=6
6 0.2869898544553507
starting to compute hyp=7
7 0.27741462628985025
starting to compute hyp=8
starting to compute hyp=9
starting to compute hyp=10
starting to compute hyp=11
starting to compute hyp=12
starting to compute hyp=13
starting to compute hyp=14
starting to compute hyp=15
starting to compute hyp=16
starting to compute hyp=17
starting to compute hyp=18
starting to compute hyp=19
starting to compute hyp=20
starting to compute hyp=21
starting to compute hyp=22
starting to compute hyp=23
starting to compute hyp=24
starting to compute hyp=25
starting to compute hyp=26
starting to compute hyp=27
starting to compute hyp=28
starting to compute hyp=29


In [37]:
mse_RF = hyp_tuning('RF', Xtrain_exo, ytrain_exo, Xdev_exo, ydev_exo)

starting to compute hyp=2
2 0.0170847770374471
starting to compute hyp=3
3 0.013114965392635238
starting to compute hyp=4
starting to compute hyp=5
5 0.008033447165090218
starting to compute hyp=6
starting to compute hyp=7
starting to compute hyp=8
starting to compute hyp=9
starting to compute hyp=10
starting to compute hyp=11
starting to compute hyp=12
starting to compute hyp=13
starting to compute hyp=14
starting to compute hyp=15
starting to compute hyp=16
starting to compute hyp=17
starting to compute hyp=18
starting to compute hyp=19
starting to compute hyp=20
starting to compute hyp=21
starting to compute hyp=22
starting to compute hyp=23
starting to compute hyp=24
starting to compute hyp=25
starting to compute hyp=26
starting to compute hyp=27
starting to compute hyp=28
starting to compute hyp=29


In [38]:
mse_KNN

{2: 0.303295558483608,
 3: 0.3020542355088155,
 5: 0.3013890671612971,
 6: 0.2869898544553507,
 7: 0.27741462628985025}

In [39]:
mse_RF

{2: 0.0170847770374471, 3: 0.013114965392635238, 5: 0.008033447165090218}