In [77]:
import csv
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LogisticRegression
from verstack.stratified_continuous_split import scsplit # pip install verstack
from sklearn.model_selection import train_test_split

import sys
import pickle
from pickle import dump
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from collections import Counter
import seaborn as sns
import joblib
from sklearn.utils import shuffle
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import xgboost

import time

In [147]:
def get_data(training=-1,testing=-1,all_dataset=False):
    train_data = pd.read_csv("data/train.csv")
    eval_data = pd.read_csv("data/evaluation.csv")
    pickel_in = open("data/train_data_preprocessed2.csv", "rb")
#     pickel_in = open("data/train_data_preprocessed2_augmented.csv", "rb")
    train_data_prepro = pickle.load(pickel_in)

    train_data_prepro = pd.concat([train_data_prepro,
                                     train_data[["timestamp"]]],
                                    axis=1)
    
    pickel_in = open("data/evaluation_preprocessed2.csv", "rb")
    eval_data_prepro = pickle.load(pickel_in)
    
    eval_data_prepro = pd.concat([eval_data_prepro,
                                     eval_data[["timestamp"]]],
                                    axis=1)
    
    eval_data_prepro = eval_data_prepro.drop(['id','timestamp_transf_hour',"timestamp_transf_weekday"],axis=1)
    train_data_prepro = train_data_prepro.drop(['timestamp_transf_hour',"timestamp_transf_weekday"],axis=1)
    
#     print(eval_data_prepro.head(10))
#     print(train_data_prepro.head(10))

    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', None)
    pd.set_option('display.max_colwidth', None)
    sns.set(context="paper")

    if not all_dataset:
#         X_train, X_test, y_train, y_test = scsplit(train_data_prepro, train_data_prepro['retweet_count'], stratify=train_data_prepro['retweet_count'], train_size=0.7, test_size=0.3)
        X_train, X_test, y_train, y_test = train_test_split(train_data_prepro, train_data_prepro['retweet_count'], train_size=0.80, test_size=0.20)
    
        if (training != -1):
            if testing == -1:
                testing = training
            X_train = X_train.head(training)
            X_test = X_test.head(testing)
            y_train = y_train.head(training)
            y_test = y_test.head(testing)
            
    else:
        X_train = train_data_prepro
        y_train = X_train['retweet_count']
        X_test = -1
        y_test = -1

    # We remove the actual number of retweets from our features since it is the value that we are trying to predict
    X_train = X_train.drop(['retweet_count'], axis=1)
    
    if not all_dataset:
        X_test = X_test.drop(['retweet_count'], axis=1)

    num_attribs = list(train_data_prepro[["user_verified", 
                                          "timestamp",
#                                           "timestamp_transf_hour", 
#                                           "timestamp_transf_weekday", 
                                          "hashtags_count",
                                          "user_statuses_count", 
                                          "user_followers_count", 
                                          "user_friends_count",
                                          "user_mentions_transf",
                                          "urls_transf",
                                          "text_length"]])
    text_attribs = "text"
    bin_counting_nominal_cat_attribs = "hashtags_transf"


    num_pipe = Pipeline([('std_scaler', StandardScaler())])
    text_pipe = Pipeline([('tfidf_vect', TfidfVectorizer(max_features=100, stop_words='english'))])
    bin_counting_nominal_cat_pipe = Pipeline([('count_vect', CountVectorizer(max_features=50))])

    full_pipe = ColumnTransformer([
        ('num', num_pipe, num_attribs),
        ('text', text_pipe, text_attribs),
        ('bin_counting', bin_counting_nominal_cat_pipe, bin_counting_nominal_cat_attribs),
    ])

    X_train = full_pipe.fit_transform(X_train)
    if not all_dataset:
        X_test = full_pipe.transform(X_test)
    X_eval = full_pipe.transform(eval_data_prepro)
#     X_eval = -1
    y_train = np.log(y_train+1)
#     y_test = np.log(y_test+1)
    
    print("SHAPE OF X_train", X_train.shape)
    print("type(X_train) = ", type(X_train))
    print("-----------------------------------")
    print("SHAPE OF y_train", y_train.shape)
    print("-----------------------------------")
    return X_train, X_test, y_train, y_test, X_eval


def train(model,X_train,y_train):
    print("Start training")
    start_time = time.time()
    
    model.fit(X_train, y_train)
    print("--- %s minutes ---" % ((time.time() - start_time)/60))
    return model
    
def predict(model, print_features = False, all_dataset = False):  
    pred_model_train = model.predict(X_train)
    model_train_mae = mean_absolute_error(y_true=np.exp(y_train)-1, y_pred=np.exp(pred_model_train)-1)        
    print("Logistic Regression prediction error for training set: ", model_train_mae) 
    if not all_dataset:
        pred_model_test = np.exp(model.predict(X_test))-1
        model_test_mae = mean_absolute_error(y_true=y_test, y_pred=pred_model_test)
        print("for testing set: ", model_test_mae)
    
    
    if print_features:
        #importances = log_reg.feature_importances_
        importance = log_reg.coef_[0]
        for i,v in enumerate(importance):
            print('Feature: %0d, Score: %.5f' % (i,v))
        # plot feature importance
        plt.bar([x for x in range(len(importance))], importance)
        plt.show()

In [148]:
X_train, X_test, y_train, y_test, X_eval = get_data(training = -1,all_dataset=False)
print(X_test == -1)

SHAPE OF X_train (532621, 159)
type(X_train) =  <class 'scipy.sparse.csr.csr_matrix'>
-----------------------------------
SHAPE OF y_train (532621,)
-----------------------------------



In [149]:
print(y_train.shape)

(532621,)


In [14]:
log_reg = LogisticRegression(multi_class='ovr',n_jobs=-1)
log_reg = train(log_reg,X_train,y_train)
joblib.dump(log_reg, "LogisticRegression.pkl")

LogisticRegression
--- 38.44895598491033 minutes ---


In [30]:
predict(log_reg,print_features=True,all_dataset=False)

NameError: name 'log_reg' is not defined

In [90]:
# Dump the results into a file that follows the required Kaggle template
eval_data = pd.read_csv("data/evaluation.csv",error_bad_lines=False)

def write_file(model, title = "LogisticRegression(42_features)_all_dataset"):
    y_pred = np.exp(model.predict(X_eval))-1
    with open(title + ".txt", 'w') as f:
        writer = csv.writer(f)
        writer.writerow(["TweetID", "NoRetweets"])
        for index, prediction in enumerate(y_pred):
            writer.writerow([str(eval_data['id'].iloc[index]) , str(int(prediction))])

In [67]:
# write_file()

# kNN

In [75]:
for i in [2,5,10,25]:
    neigh = KNeighborsRegressor(n_neighbors=i,n_jobs=-1)
    model = train(neigh,X_train,y_train)
    title = "neighbors_k" + str(i) + ".pkl"
    joblib.dump(model, title)
    predict(model,print_features=False,all_dataset=False)

# NN

In [154]:
regr = MLPRegressor(hidden_layer_sizes=(32,128,64,32,16,8,8),max_iter=200,tol = 0.01,n_iter_no_change = 20,learning_rate = 'adaptive',early_stopping=True,validation_fraction = 0.15,verbose=1)
model = train(regr,X_train,y_train)
joblib.dump(model, "nn_regression.pkl")

Start training
Iteration 1, loss = 0.71525302
Validation score: 0.612357
Iteration 2, loss = 0.55981879
Validation score: 0.630626
Iteration 3, loss = 0.54151346
Validation score: 0.636639
Iteration 4, loss = 0.53317662
Validation score: 0.632802
Iteration 5, loss = 0.52697728
Validation score: 0.640647
Iteration 6, loss = 0.52202607
Validation score: 0.645201
Iteration 7, loss = 0.51773980
Validation score: 0.646248
Iteration 8, loss = 0.51442878
Validation score: 0.642896
Iteration 9, loss = 0.51252236
Validation score: 0.645573
Iteration 10, loss = 0.50891957
Validation score: 0.646650
Iteration 11, loss = 0.50694261
Validation score: 0.643897
Iteration 12, loss = 0.50430216
Validation score: 0.646754
Iteration 13, loss = 0.50269000
Validation score: 0.646117
Iteration 14, loss = 0.50085603
Validation score: 0.645358
Iteration 15, loss = 0.49844414
Validation score: 0.645141
Iteration 16, loss = 0.49680232
Validation score: 0.648182
Iteration 17, loss = 0.49545827
Validation score: 

['nn_regression.pkl']

In [155]:
predict(model,print_features=False,all_dataset=False)

Logistic Regression prediction error for training set:  137.53091620194544
for testing set:  155.64408649394656


In [150]:
regr2 = MLPRegressor(hidden_layer_sizes=(256,128,32),max_iter=200,learning_rate = 'adaptive',early_stopping=True,verbose=1,tol = 0.001,validation_fraction = 0.2,n_iter_no_change = 10)
model2 = train(regr2,X_train,y_train)
joblib.dump(model2, "nn_regression2.pkl")

Start training
Iteration 1, loss = 0.61329526
Validation score: 0.621881
Iteration 2, loss = 0.55294124
Validation score: 0.632802
Iteration 3, loss = 0.53498994
Validation score: 0.633081
Iteration 4, loss = 0.52287647
Validation score: 0.638751
Iteration 5, loss = 0.51343104
Validation score: 0.639497
Iteration 6, loss = 0.50531565
Validation score: 0.642689
Iteration 7, loss = 0.49833320
Validation score: 0.643583
Iteration 8, loss = 0.49151834
Validation score: 0.642395
Iteration 9, loss = 0.48413198
Validation score: 0.642404
Iteration 10, loss = 0.47781648
Validation score: 0.640918
Iteration 11, loss = 0.46990553
Validation score: 0.637020
Iteration 12, loss = 0.46228186
Validation score: 0.636018
Iteration 13, loss = 0.45469004
Validation score: 0.636781
Iteration 14, loss = 0.44812088
Validation score: 0.626416
Iteration 15, loss = 0.44044143
Validation score: 0.626521
Iteration 16, loss = 0.43334028
Validation score: 0.626205
Iteration 17, loss = 0.42701989
Validation score: 

['nn_regression2.pkl']

In [151]:
predict(model2,print_features=False,all_dataset=False)

Logistic Regression prediction error for training set:  146.84982640875995
for testing set:  168.53574028673216


In [None]:
# regr3 = MLPRegressor(hidden_layer_sizes=(64,128,128,64,32,16,8),max_iter=200,learning_rate = 'adaptive',early_stopping=True,verbose=1,tol = 0.00001,n_iter_no_change = 15)
# model3 = train(regr3,X_train,y_train)
# joblib.dump(model3, "nn_regression3.pkl")

In [37]:
predict(model3,print_features=False,all_dataset=False)

Logistic Regression prediction error for training set:  138.76219023105082
for testing set:  146.46251520992584


In [21]:
regr4 = MLPRegressor(hidden_layer_sizes=(64,128,64),max_iter=100,learning_rate = 'adaptive',validation_fraction = 0.2,early_stopping=True,verbose=1,tol = 0.00001,n_iter_no_change = 20)
model4 = train(regr4,X_train,y_train)
joblib.dump(model4, "nn_regression4.pkl")

Start training
Iteration 1, loss = 0.61292405
Validation score: 0.618130
--- 0.12340400616327922 minutes ---


['nn_regression4.pkl']

In [49]:
predict(model4,print_features=False,all_dataset=False)

Logistic Regression prediction error for training set:  184.84210778608414
for testing set:  182.41097344110497


In [81]:
write_file(model3,title="NeuralNetworkAugmented_All_Dataset")

# Random Forest

In [100]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 1000, stop = 2000, num = 3)]
# Number of features to consider at every split
max_features = ['sqrt','log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 150, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [1000, 1500, 2000], 'max_features': ['sqrt', 'log2'], 'max_depth': [10, 24, 38, 52, 66, 80, 94, 108, 122, 136, 150, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [None]:
# start_time = time.time()

# rf = RandomForestRegressor(n_jobs = -1)

# rf_random = RandomizedSearchCV(scoring = "neg_mean_absolute_error",estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, random_state=42, n_jobs = -1)
# # Fit the random search model
# train(rf_random,X_train, y_train)

# print("--- Total time: %s minutes ---" % ((time.time() - start_time)/60))

Start training


In [107]:
print(rf_random.best_estimator_)
predict(rf_random.best_estimator_,print_features=False,all_dataset=False)

NameError: name 'rf_random' is not defined

In [108]:
rdf_reg = RandomForestRegressor(max_depth=5, max_leaf_nodes = 16, criterion = 'mse',
                      n_estimators=500, n_jobs=-1)
train(rdf_reg,X_train,y_train)

joblib.dump(rdf_reg, "rdf_reg.pkl")

Start training
--- 3.605497244993846 minutes ---


['rdf_reg.pkl']

In [109]:
predict(rdf_reg, print_features=False, all_dataset=False)

Logistic Regression prediction error for training set:  143.39218311079114
for testing set:  150.94908086717808


In [101]:
write_file(rdf_reg,title="random_forest")

# Gradient Boosting

In [110]:
gb_reg = GradientBoostingRegressor(criterion='mse', max_depth = 10, n_estimators = 100)
train(gb_reg, X_train,y_train)

joblib.dump(gb_reg, "gb_reg.pkl")

Start training
--- 8.291369120279947 minutes ---


['gb_reg.pkl']

In [111]:
predict(gb_reg, print_features=False, all_dataset=False)

Logistic Regression prediction error for training set:  126.75270591324156
for testing set:  143.45199128104338


In [95]:
write_file(gb_reg,title="Gradient_Boosting")

### Grid Search

In [129]:
param_grid = [
    {'max_depth': [7,12], 'n_estimators': [85,115], 'learning_rate': [0.1]}
]

In [130]:
start_time = time.time()

gb_reg_search = GradientBoostingRegressor()
grid_search = GridSearchCV(gb_reg_search, param_grid, cv=2, return_train_score=True, n_jobs=-1)
grid_search.fit(X_train, y_train)

print("--- Total time: %s minutes ---" % ((time.time() - start_time)/60))

--- Total time: 12.837208513418833 minutes ---


In [131]:
print(grid_search.best_params_)
print(grid_search.best_estimator_)
cvres = grid_search.cv_results_
learn001 = []
learn01 = []
learn05 = []
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    if params['learning_rate'] == 0.01:
        learn001.append(mean_score)
    if params['learning_rate'] == 0.1:
        learn01.append(mean_score)
    if params['learning_rate'] == 0.5:
        learn05.append(mean_score)
    print(mean_score, params)


print(np.mean(learn001), np.std(learn001))
print(np.mean(learn01))
print(np.std(learn01))
print(np.mean(learn05))
print(np.std(learn05))
predict(grid_search.best_estimator_,print_features=False,all_dataset=False)

{'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 115}
GradientBoostingRegressor(max_depth=7, n_estimators=115)
0.6733031155491349 {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 85}
0.6749588737038559 {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 115}
0.6720478763024451 {'learning_rate': 0.1, 'max_depth': 12, 'n_estimators': 85}
0.6722412976369028 {'learning_rate': 0.1, 'max_depth': 12, 'n_estimators': 115}
nan nan
0.6731377907980847
0.0011549122064959177
nan
nan
Logistic Regression prediction error for training set:  133.2678600133127
for testing set:  147.36643766382008


# XGBoost

In [112]:
xgb_reg = xgboost.XGBRegressor(nthread=-1)
xgb_reg.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5)

[0]	validation_0-rmse:2584.63306
Will train until validation_0-rmse hasn't improved in 5 rounds.
[1]	validation_0-rmse:2584.59595
[2]	validation_0-rmse:2584.56982
[3]	validation_0-rmse:2584.55029
[4]	validation_0-rmse:2584.53809
[5]	validation_0-rmse:2584.52783
[6]	validation_0-rmse:2584.52051
[7]	validation_0-rmse:2584.51440
[8]	validation_0-rmse:2584.50855
[9]	validation_0-rmse:2584.50537
[10]	validation_0-rmse:2584.50195
[11]	validation_0-rmse:2584.49902
[12]	validation_0-rmse:2584.49731
[13]	validation_0-rmse:2584.49609
[14]	validation_0-rmse:2584.49414
[15]	validation_0-rmse:2584.49365
[16]	validation_0-rmse:2584.49243
[17]	validation_0-rmse:2584.49170
[18]	validation_0-rmse:2584.49072
[19]	validation_0-rmse:2584.48901
[20]	validation_0-rmse:2584.48877
[21]	validation_0-rmse:2584.48828
[22]	validation_0-rmse:2584.48755
[23]	validation_0-rmse:2584.48755
[24]	validation_0-rmse:2584.48731
[25]	validation_0-rmse:2584.48706
[26]	validation_0-rmse:2584.48682
[27]	validation_0-rmse:2584.

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=-1, nthread=-1, num_parallel_tree=1,
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             subsample=1, tree_method='exact', validate_parameters=1,
             verbosity=None)

In [113]:
predict(xgb_reg, print_features=False, all_dataset=False)

Logistic Regression prediction error for training set:  135.12797181465436
for testing set:  145.6330989741252


# Voting

### First try

In [114]:
from sklearn.ensemble import VotingRegressor

# best_gb_regr = GradientBoostingRegressor(learning_rate=0.1,max_depth=10,n_estimators=100)

voting_reg0 = VotingRegressor(
    estimators=[('gb_reg', gb_reg), ('xgb_reg', xgb_reg),('nn2',model2),('rdf',rdf_reg)], n_jobs = -1
)

voting_reg0.fit(X_train, y_train)

VotingRegressor(estimators=[('gb_reg',
                             GradientBoostingRegressor(criterion='mse',
                                                       max_depth=10)),
                            ('xgb_reg',
                             XGBRegressor(base_score=0.5, booster='gbtree',
                                          colsample_bylevel=1,
                                          colsample_bynode=1,
                                          colsample_bytree=1, gamma=0,
                                          gpu_id=-1, importance_type='gain',
                                          interaction_constraints='',
                                          learning_rate=0.300000012,
                                          max_delta_step=0, max_depth=6,
                                          min_child_weight=1...
                                          reg_lambda=1, scale_pos_weight=1,
                                          subsample=1, tree_method='exact',
 

In [116]:
predict(voting_reg0, print_features=False, all_dataset=False)

Logistic Regression prediction error for training set:  138.00320024996174
for testing set:  146.75392498476893


### Second try

In [137]:
start_time = time.time()

voting_MLP = MLPRegressor(hidden_layer_sizes=(64,32,16,8),max_iter=200,learning_rate = 'adaptive',early_stopping=True,verbose=1,tol = 0.000001,validation_fraction = 0.15,n_iter_no_change = 20)
voting_XGBR = xgboost.XGBRegressor(nthread=-1)
voting_GBR = GradientBoostingRegressor(criterion='mse', max_depth = 7, n_estimators = 100, n_iter_no_change=5,)
rdf_reg = RandomForestRegressor(max_depth=50, max_leaf_nodes = 16, criterion = 'mse',
                      n_estimators=1000, n_jobs=-1, min_samples_split = 4,max_features='sqrt')

voting_reg = VotingRegressor(
    estimators=[('gb_reg', voting_GBR), ('xgb_reg', voting_XGBR),('nn2',voting_MLP),('rdf',rdf_reg)], n_jobs = -1
)

voting_reg.fit(X_train, y_train)

print("--- Total time: %s minutes ---" % ((time.time() - start_time)/60))

--- Total time: 5.383676473299662 minutes ---


In [138]:
predict(voting_reg, print_features=False, all_dataset=False)

Logistic Regression prediction error for training set:  139.58611859535534
for testing set:  151.61104923322347


In [118]:
write_file(voting_reg0,title="Voting Regression with MLP, XGBR, GBR and RDF")