In [4]:
import csv
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import mean_absolute_error
from verstack.stratified_continuous_split import scsplit # pip install verstack
from sklearn.model_selection import train_test_split

import sys
import pickle
from pickle import dump
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from collections import Counter
import seaborn as sns
import joblib

import time
from sklearn.model_selection import RandomizedSearchCV


In [5]:
def get_data(training=-1,testing=-1,all_dataset=False):
    pickel_in = open("data/train_data_preprocessed.csv", "rb")
    train_data_prepro = pickle.load(pickel_in)
    
    pickel_in = open("data/evaluation_preprocessed.csv", "rb")
    eval_data_prepro = pickle.load(pickel_in)
#     eval_data_prepro = pd.read_csv("data/evaluation.csv",error_bad_lines=False)

    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', None)
    pd.set_option('display.max_colwidth', None)

    sns.set(context="paper")

    if not all_dataset:
#         X_train, X_test, y_train, y_test = scsplit(train_data_prepro, train_data_prepro['retweet_count'], stratify=train_data_prepro['retweet_count'], train_size=0.7, test_size=0.3)
        X_train, X_test, y_train, y_test = train_test_split(train_data_prepro, train_data_prepro['retweet_count'], train_size=0.7, test_size=0.3)
    
        if (training != -1):
            if testing == -1:
                testing = training
            X_train = X_train.head(training)
            X_test = X_test.head(testing)
            y_train = y_train.head(training)
            y_test = y_test.head(testing)
            
    else:
        X_train = train_data_prepro
        y_train = X_train['retweet_count']
        X_test = -1
        y_test = -1

    # We remove the actual number of retweets from our features since it is the value that we are trying to predict
    X_train = X_train.drop(['retweet_count'], axis=1)
    
    if not all_dataset:
        X_test = X_test.drop(['retweet_count'], axis=1)

    num_attribs = list(train_data_prepro[["user_verified", "timestamp_transf_hour", "timestamp_transf_weekday", "hashtags_count", "user_statuses_count", "user_followers_count", "user_friends_count"]])
    text_attribs = "text"
    bin_counting_nominal_cat_attribs = "hashtags_transf"


    num_pipe = Pipeline([('std_scaler', StandardScaler())])
    text_pipe = Pipeline([('tfidf_vect', TfidfVectorizer(max_features=25, stop_words='english'))])
    bin_counting_nominal_cat_pipe = Pipeline([('count_vect', CountVectorizer(max_features=10))])

    full_pipe = ColumnTransformer([
        ('num', num_pipe, num_attribs),
        ('text', text_pipe, text_attribs),
        ('bin_counting', bin_counting_nominal_cat_pipe, bin_counting_nominal_cat_attribs),
    ])

    X_train = full_pipe.fit_transform(X_train)
    if not all_dataset:
        X_test = full_pipe.transform(X_test)
    X_eval = full_pipe.transform(eval_data_prepro)

    print("SHAPE OF X_train", X_train.shape)
    print("type(X_train) = ", type(X_train))
    print("-----------------------------------")
    print("SHAPE OF y_train", y_train.shape)
    print("-----------------------------------")
    return X_train, X_test, y_train, y_test, X_eval

In [6]:
def train(model,X_train,y_train):
    start_time = time.time()

    model.fit(X_train, y_train)

    print("--- %s minutes ---" % ((time.time() - start_time)/60))

    return model

In [28]:
def predict(model, print_features = False, all_dataset = False):  
    pred_model_train = model.predict(X_train)
    model_train_mae = mean_absolute_error(y_true=y_train, y_pred=pred_model_train)        
    print("Logistic Regression prediction error for training set: ", model_train_mae) 
    if not all_dataset:
        pred_model_test = model.predict(X_test)
        model_test_mae = mean_absolute_error(y_true=y_test, y_pred=pred_model_test)
        print("for testing set: ", model_test_mae)
    
    
    if print_features:
        #importances = log_reg.feature_importances_
        importance = model.coef_[0]
        for i,v in enumerate(importance):
            print('Feature: %0d, Score: %.5f' % (i,v))
        # plot feature importance
        plt.bar([x for x in range(len(importance))], importance)
        plt.show()
        
    return model_train_mae, model_test_mae

In [84]:
X_train, X_test, y_train, y_test, X_eval = get_data(training = -1, testing = -1, all_dataset=False)
print(X_test == -1)

SHAPE OF X_train (466043, 42)
type(X_train) =  <class 'scipy.sparse.csr.csr_matrix'>
-----------------------------------
SHAPE OF y_train (466043,)
-----------------------------------



In [85]:
# # # # SELECT AND TRAIN MODELS # # # #
train_mae_scores = []
test_mae_scores = []

'''
# Linear Regressor
print("Linear Regressor")

# Lasso Regressor
print("Lasso Regressor")

# Ridge Regressor
print("Ridge Regressor")

# Elastic Net Regressor
print("Elastic Net Regressor")
'''

'\n# Linear Regressor\nprint("Linear Regressor")\n\n# Lasso Regressor\nprint("Lasso Regressor")\n\n# Ridge Regressor\nprint("Ridge Regressor")\n\n# Elastic Net Regressor\nprint("Elastic Net Regressor")\n'

# Linear SVC

In [None]:
linSVC = LinearSVC()
train(linSVC,X_train,y_train)

joblib.dump(linSVC, "linSVC.pkl")

In [None]:
linSVC_train_mae, linSVC_test_mae = predict(linSVC, print_features=False, all_dataset=False)

# Random Forest

In [86]:
rdf_reg = RandomForestRegressor(bootstrap=False, max_depth=70, max_features='sqrt',
                      min_samples_leaf=4, min_samples_split=10,
                      n_estimators=2000, n_jobs=-1,scoring = "neg_mean_absolute_error")
train(rdf_reg,X_train,y_train)

joblib.dump(rdf_reg, "rdf_reg.pkl")

--- 76.61278418302535 minutes ---


['rdf_reg.pkl']

In [87]:
rdf_reg_train_mae, rdf_reg_test_mae = predict(rdf_reg, print_features=False, all_dataset=False)

Logistic Regression prediction error for training set:  180.9097614721928
for testing set:  221.86609252172846


In [77]:
import pprint
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 1000, stop = 2000, num = 3)]
# Number of features to consider at every split
max_features = ['sqrt','log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [1000, 1500, 2000], 'max_features': ['sqrt', 'log2'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [36]:
start_time = time.time()

rf = RandomForestRegressor(n_jobs = -1)

rf_random1 = RandomizedSearchCV(scoring = "neg_mean_absolute_error",estimator = rf, param_distributions = random_grid, verbose = 3, n_iter = 25, cv = 2, random_state=42, n_jobs = -1)
# Fit the random search model
train(rf_random,X_train, y_train)

print("--- Total time: %s minutes ---" % ((time.time() - start_time)/60))

Fitting 2 folds for each of 50 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 46.8min finished


--- 47.61357934077581 minutes ---
--- Total time: 47.614506125450134 minutes ---


In [55]:
print(rf_random1.best_estimator_)
predict(rf_random1.best_estimator_,print_features=False,all_dataset=False)

RandomForestRegressor(bootstrap=False, max_depth=70, max_features='sqrt',
                      min_samples_leaf=4, min_samples_split=10,
                      n_estimators=2000, n_jobs=-1)
Logistic Regression prediction error for training set:  175.7325555366603
for testing set:  228.37667239954237


(175.7325555366603, 228.37667239954237)

In [76]:
for i in range(1,50):
    index = np.where(rf_random.cv_results_['rank_test_score'] == i)[0][0]
    print(index)
    print('score: ',rf_random.cv_results_['mean_test_score'][index])
#     print(predict(rf_random.cv_results_['params'][i]))
    print(rf_random.cv_results_['params'][index])

2
score:  0.04065992886126546
{'n_estimators': 2000, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 70, 'bootstrap': False}
30
score:  0.04035516113341564
{'n_estimators': 800, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 70, 'bootstrap': False}
28
score:  0.04014696487325975
{'n_estimators': 2000, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 40, 'bootstrap': True}
42
score:  0.03997506938652595
{'n_estimators': 400, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 50, 'bootstrap': True}
23
score:  0.03977353722220406
{'n_estimators': 800, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 30, 'bootstrap': True}
19
score:  0.03974387163898546
{'n_estimators': 800, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 80, 'bootstrap': True}
45
score:  0.03903964961895018
{'n

# Decision Tree

In [None]:
tree_reg = DecisionTreeRegressor(criterion='mse')
train(tree_reg, X_train,y_train)

joblib.dump(tree_reg, "tree_reg.pkl")

In [9]:
tree_reg_train_mae, tree_reg_test_mae = predict(tree_reg, print_features=False, all_dataset=False)

Logistic Regression prediction error for training set:  0.1352853856001227
for testing set:  262.85804610224653


# Gradient Boosting

In [6]:
gb_reg = GradientBoostingRegressor(criterion='mse')
train(gb_reg, X_train,y_train)

joblib.dump(gb_reg, "gb_reg.pkl")

--- 0.8925456086794535 minutes ---


['gb_reg.pkl']

In [7]:
gb_reg_train_mae, gb_reg_test_mae = predict(gb_reg, print_features=False, all_dataset=False)

Logistic Regression prediction error for training set:  220.47702797434127
for testing set:  222.91790133523364


In [None]:
# # # # RANKING OF MODELS # # # #

train_mae_scores.append(linSVC_train_mae)
test_mae_scores.append(linSVC_test_mae)

train_mae_scores.append(rdf_reg_train_mae)
test_mae_scores.append(rdf_reg_test_mae)

train_mae_scores.append(tree_reg_train_mae)
test_mae_scores.append(tree_reg_test_mae)

train_mae_scores.append(gb_reg_train_mae)
test_mae_scores.append(gb_reg_test_mae)

estimators = ['GradientBoostingRegressor',
              'DecisionTreeRegressor',
              'RandomForestRegressor',
              'LinearSVC']

bar_width = 0.10

fig, ax = plt.subplots()
index = np.arange(len(estimators))
training_scores1 = plt.barh(index, train_mae_scores, bar_width, color='darkred', alpha=0.6, label='Training Scores')
test_scores1 = plt.barh(index+bar_width, test_mae_scores, bar_width, color='darkgreen', alpha=0.6, label='Test Scores')
ax.set_title("Ranking of models by MAE scores", fontsize=15, weight='bold')
ax.set_xlabel('MAE')
ax.set_ylabel('Estimators')
ax.set_yticks(index+bar_width/2)
ax.set_yticklabels(estimators)
plt.legend()
plt.tight_layout()
plt.show()