# In [None]:
import csv
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import mean_absolute_error
from verstack.stratified_continuous_split import scsplit # pip install verstack

import sys
import pickle
from pickle import dump
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from collections import Counter
import seaborn as sns
import joblib

######################
# # # # OTHERS # # # #
######################
# todo: split training set into a training and test set. Then split the training set into a training and validation sets
# todo: test all regressors and classifiers with regularization
# todo: test NN with regularization
# todo: select best models, create pipelines and combine them (ensemble learning)

# X = np.linspace(0, 1, 100)
# Y = np.linspace(0, 1, 100)
# f1 = []
# for i in range(100):
#     for j in range(100):
#         f1.append(2*(X[i]*Y[i])/(X[i]+Y[i]))
#
# fig = plt.figure()
# ax = plt.axes(projection='3d')
# C = X + Y
# ax.scatter(X, Y, f1, c=C)
# plt.xlabel('x')
# plt.ylabel('y')
# plt.show()

# count_vec = CountVectorizer()
# corpus = ["This is the first document, document one",
#           "This is the second document,",
#           "And this is the third one.",
#           "Is this the first document?"]
# X = count_vec.fit_transform(corpus)
# print("X = \n", X)
# print("X.type = \n", type(X))
# print("X.shape = \n", X.shape)
# print("X.toarray() = \n", X.toarray())
# print("count_vec.get_feature_names() = \n", count_vec.get_feature_names())
# print("corpus", corpus)


# # Tally occurrences of words in a list
# cnt = Counter()
# for word in ['red', 'blue', 'red', 'green', 'blue', 'blue']:
#     cnt[word] += 1
# print("cnt: ", cnt)  # Counter({'blue': 3, 'red': 2, 'green': 1})
#
# # pandas.DataFrame.replace
# s = pd.Series([0, 1, 2, 3, 4])
# df = pd.DataFrame({'A': [0, 1, 2, 3, 4],
#                    'B': [5, 6, 3, 8, 9],
#                    'C': ['a, b, c', 'b', 'c', 'd', 'e']})
# print("s = \n", s)
# s.replace(0,5, inplace=True)  # Caution: pass inplace=True or write s = s.replace(0,5), otherwise the call has no effect
# print("s = \n", s)
#
# print("df = \n", df)
# df.replace(0, 5, inplace=True)
# print("df = \n", df)
# df.replace([0, 1, 2, 3], 4, inplace=True)
# print("df = \n", df)
# df.replace([0, 1, 2, 3], [4, 3, 2, 1], inplace=True)
# print("df = \n", df)
# last = 'd'
# print(last)
# df.replace(last, 'COVID19_FV', inplace=True)
# print("df = \n", df)
#
# df2 = pd.DataFrame({'A': ['bat', 'foo', 'bait'],
#                     'B': ['abc', 'bar', 'xyz']})
# print("df2 = \n", df2)
# df2.replace(to_replace=r'^xy.$', value='new', regex=True, inplace=True)
# print("df2 = \n", df2)

# sys.exit()






######################################
# # # # Load the training data # # # #
######################################
# The raw training set is read from the current working directory.
# Alternative location kept for reference:
# train_data = pd.read_csv("../../iCloud Drive (archive)/Documents/Polytechnique_X/INF554/Project/train.csv")
train_data = pd.read_csv("train.csv")


######################################
# # # # AESTHETIC # # # #
######################################
# Set every pandas display limit to None so printed frames are not truncated.
for display_option in ('display.max_rows',
                       'display.max_columns',
                       'display.width',
                       'display.max_colwidth'):
    pd.set_option(display_option, None)

# Seaborn plotting context used for all figures of this script.
sns.set(context="paper")


######################################
# # # # QUICK LOOK AT THE DATA # # # #
######################################
# Print the size, the first 50 rows, the column summary and the numeric
# statistics of the freshly loaded training set.
print("Dataset shape:", train_data.shape)
print("Dataset head:\n", train_data.head(50))
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that printed the report first and then the
# label followed by a stray "None").
print("Dataset info:")
train_data.info()
print("Dataset description:\n", train_data.describe())
"""
# train_data.hist(bins=50)  # bins=round(np.sqrt(train_data.shape[0]))
# plt.suptitle("Distribution of each numerical attribute", fontsize=15, weight='bold')
# plt.show()

plt.figure()
train_data['retweet_count'].hist(bins=50)  # bins=round(np.sqrt(train_data.shape[0]))
plt.title("Distribution of retweet_count", fontsize=15, weight='bold')
plt.xlabel("retweet_count")
plt.ylabel("Frequency")
plt.tight_layout()
plt.show()


# Distribution of retweet_count per slices
train_data_FV = train_data.copy()
train_data_FV["retweet_count_cat"] = pd.cut(train_data_FV["retweet_count"],
                                            bins=[0., 10., 20., 40., 60., 80.0, 110.0, 140., 180., 250., 300., 400., np.inf],
                                            labels=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
train_data_FV["retweet_count_cat"].hist()
plt.title("Distribution of retweet_count per categories", fontsize=15, weight='bold')
plt.xlabel("retweet_count_cat")
plt.ylabel("Frequency")
plt.tight_layout()
plt.show()

print("Counts of unique values:\ntrain_data_FV['retweet_count_cat'].value_counts() =")
print(train_data_FV['retweet_count_cat'].value_counts())


# Pie Chart: Distribution of retweet_count per slices
label_classes = [1, 2, 3, 4, 5, 6, 7, 8]
label_names = ['0-20', '20-40.', '40-60', '60-80', '80-110', '110-140', '140-180', '180-inf']
sizes = [146234+22336, 18448, 18008, 8882, 5797, 5789, 4455, 4025+3683+3104+2213]
colors = ['cadetblue', 'orange', 'yellowgreen', 'indianred', 'mediumpurple', 'darkblue', 'darkgreen', 'darkred']
fig, ax = plt.subplots()
ax.pie(sizes, colors=colors, autopct='%1.1f%%', startangle=90)
ax.legend(label_names, title="retweet_count in:", loc="upper right")
ax.axis('equal')
plt.suptitle('Quantity of instances per classes', fontsize=15, weight='bold')
plt.title('Highlight of the imbalanced dataset', fontsize=10)
plt.tight_layout()
plt.show()
"""

######################################
# # # # DATA PREPROCESSING # # # #
######################################
# Replace missing values in these three columns with an explicit placeholder
# string (one placeholder per column).
missing_placeholders = {
    "user_mentions": 'No user_mentions',
    "urls": 'No urls',
    "hashtags": 'No hashtag',
}
for column_name, placeholder in missing_placeholders.items():
    train_data[column_name] = train_data[column_name].fillna(placeholder)

# # # # Text Attributes # # # #

# # user_mentions
# print("user_mentions head:\n", train_data["user_mentions"].head(10))
# # print("user_mentions:\n", train_data["user_mentions"].value_counts())
# train_data["user_mentions"] = train_data["user_mentions"].fillna('No user_mentions')
# print("user_mentions head:\n", train_data["user_mentions"].head(10))
# from sklearn.preprocessing import OneHotEncoder
# one_hot_encoder = OneHotEncoder()
# user_mentions_cat_encoded = one_hot_encoder.fit_transform(train_data[["user_mentions"]])
# print("user_mentions_cat_encoded =", user_mentions_cat_encoded)
#
# # urls
# print("urls head:\n", train_data["urls"].head(10))
# # print("urls:\n", train_data["urls"].value_counts())
# train_data["urls"] = train_data["urls"].fillna('No urls')
# print("urls head:\n", train_data["urls"].head(10))
# from sklearn.preprocessing import OneHotEncoder
# one_hot_encoder = OneHotEncoder()
# urls_cat_encoded = one_hot_encoder.fit_transform(train_data[["urls"]])
# print("urls_cat_encoded =", urls_cat_encoded)
#
# # hashtags
# print("Hashtags head:\n", train_data["hashtags"].head(10))
# # print("Hashtags:\n", train_data["hashtags"].value_counts())
# train_data["hashtags"] = train_data["hashtags"].fillna('No hashtag')
# print("Hashtags head:\n", train_data["hashtags"].head(10))
#
# # from sklearn.preprocessing import OrdinalEncoder
# # ordinal_encoder = OrdinalEncoder()
# # hashtags_cat_encoded = ordinal_encoder.fit_transform(train_data[["hashtags"]])
# # print(hashtags_cat_encoded[:10])
# # print(ordinal_encoder.categories_)
#
# from sklearn.preprocessing import OneHotEncoder
# one_hot_encoder = OneHotEncoder()
# hashtags_cat_encoded = one_hot_encoder.fit_transform(train_data[["hashtags"]])
# print("hashtags_cat_encoded =", hashtags_cat_encoded)
#
# # text
# tfidf_vect = TfidfVectorizer(max_features=100, stop_words='english')
# text_tfidf_vect = tfidf_vect.fit_transform(train_data[["text"]])

#X_train = vectorizer.fit_transform(X_train['text'])
#X_test = vectorizer.transform(X_test['text'])


# # # # DATA CLEANING # # # #
# NaN -> placeholder substitution for the three columns below. The same fill
# is already applied right under the DATA PREPROCESSING banner of this script,
# so on already-filled columns this pass changes nothing.
for column_name, placeholder in (("user_mentions", 'No user_mentions'),
                                 ("urls", 'No urls'),
                                 ("hashtags", 'No hashtag')):
    train_data[column_name] = train_data[column_name].fillna(value=placeholder)


###################################
# # # # FEATURE ENGINEERING # # # #
###################################
# The engineered timestamp and hashtag features are loaded from disk.
# NOTE: despite their ".csv" suffix, both files are read with pickle.load,
# i.e. they must contain pickled objects, not CSV text.
# SECURITY: unpickling can execute arbitrary code; only load files that were
# produced by trusted code.

# # # # FEATURE ENGINEERING: timestamp # # # #
print("FEATURE ENGINEERING: timestamp --------------------")
# "with" guarantees the file handle is closed once the object is loaded.
with open("train_data_timestamp_preprocessed.csv", "rb") as pickel_in:
    train_data_timestamp_preprocessed = pickle.load(pickel_in)


# # # # FEATURE ENGINEERING: hashtags # # # #
print("FEATURE ENGINEERING: hashtags --------------------")
with open("train_data_hashtag_preprocessed.csv", "rb") as pickel_in:
    train_data_hashtag_preprocessed = pickle.load(pickel_in)



# # # # FULL PREPROCESSED DATASET # # # #
# Assemble the final frame column-wise (axis=1) from the engineered timestamp
# columns, the engineered hashtag columns and the raw columns that are kept.
# pd.concat matches rows by index label.
train_data = pd.concat([train_data_timestamp_preprocessed[["timestamp_transf_hour", "timestamp_transf_weekday"]],
                        train_data_hashtag_preprocessed[["hashtags_transf", "hashtags_count"]],
                        train_data[["retweet_count", "text", "user_verified", "user_statuses_count", "user_followers_count", "user_friends_count"]]],
                       axis=1)

# Sanity check of the assembled frame (the original printed this head twice).
print("train_data = \n", train_data.head(10))
print("train_data rc =", train_data["retweet_count"].head())

# Turn every hashtags_transf entry (an iterable) into one comma-separated string.
train_data['hashtags_transf'] = train_data['hashtags_transf'].apply(lambda x: ','.join(map(str, x)))  # todo: put this line in data_pre_hashatags before pickling the dataset

# Fix the column order: features first, then the raw text, then the target.
train_data = train_data[["timestamp_transf_hour", "timestamp_transf_weekday", "hashtags_count", "hashtags_transf", "user_verified", "user_statuses_count", "user_followers_count", "user_friends_count", "text", "retweet_count"]]

# Persist the frame with pickle (the ".csv" suffix is kept for compatibility
# with whatever reads this file). "with" makes sure the data is flushed and
# the handle is closed.
with open('train_data_preprocessed.csv', 'wb') as preprocessed_out:
    dump(train_data, preprocessed_out)

# NOTE(review): sys.exit() terminates the script here, so nothing below this
# line (split, pipeline, model training, ranking plot) runs while it is active.
sys.exit()

"""
# # # # PREPROCESSING PIPELINE # # # #

# Data Cleaning
train_data["user_mentions"] = train_data["user_mentions"].fillna('No user_mentions')
train_data["urls"] = train_data["urls"].fillna('No urls')
train_data["hashtags"] = train_data["hashtags"].fillna('No hashtag')

num_attributes = train_data.drop(["user_mentions", "urls", "hashtags", "text"], axis=1)
cat_attributes = train_data[["user_mentions", "urls", "hashtags"]].copy()
text_attributes = train_data["text"].copy()

#num_attribs = [["timestamp", "user_verified", "user_statuses_count", "user_followers_count", "user_friends_count"]]
num_attribs = list(num_attributes)
cat_attribs = ["user_mentions", "urls", "hashtags"]
text_attribs = "text"

num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler())
])
#num_attributes_FV = num_pipe.fit_transform(num_attributes)

cat_pipe = Pipeline([
    ('encoder', OneHotEncoder())
])
#cat_attributes_FV = cat_pipe.fit_transform(cat_attributes)

text_pipe = Pipeline([
    ('tfidf_vect', TfidfVectorizer(max_features=100, stop_words='english'))
])
#text_attributes_FV = text_pipe.fit_transform(text_attributes)

full_pipe = ColumnTransformer([
    ("num", num_pipe, num_attribs),
    ("cat", cat_pipe, cat_attribs),
    ("text", text_pipe, text_attribs)
])

print("-----------------------------------")
print("SHAPE OF train_data", train_data.shape)
print("-----------------------------------")
X_TRAIN_FV = full_pipe.fit_transform(train_data)
print("SHAPE OF X_TRAIN_FV", X_TRAIN_FV.shape)
print("type(X_TRAIN_FV) = ", type(X_TRAIN_FV))
print("-----------------------------------")


X_TRAIN_FV = pd.DataFrame(X_TRAIN_FV.toarray())
print("-----------------------------------")
print("SHAPE OF X_TRAIN_FV once pd.DF is applied", X_TRAIN_FV.shape)
print("type(X_TRAIN_FV) once pd.DF is applied", type(X_TRAIN_FV))
print("-----------------------------------")
X_TRAIN_FV["retweet_count"] = list(train_data["retweet_count"])
print("SHAPE OF X_TRAIN_FV once pd.DF is applied and retweet_count added", X_TRAIN_FV.shape)
print("type(X_TRAIN_FV) once pd.DF is applied and retweet_count added", type(X_TRAIN_FV))
print("-----------------------------------")
"""


"""
# # # # TARGET VALUES # # # #
target_transformed = -1/(train_data['retweet_count']+1)+1

plt.figure()
plt.suptitle("Distribution of the preprocessed target", fontsize=15, weight='bold')
plt.subplot(2, 2, 1)
train_data['retweet_count'].hist(bins=50)  # bins=round(np.sqrt(train_data.shape[0]))
plt.title("Distrib of retweet_count", fontsize=10, weight='bold')
plt.xlabel("retweet_count")
plt.ylabel("frequency")
plt.subplot(2, 2, 2)
plt.hist(target_transformed, bins=50, label='Inverse Transformed retweet_count')  # bins=round(np.sqrt(train_data.shape[0]))
plt.title("Distrib of -1/(1+retweet_count)+1", fontsize=10, weight='bold')
plt.xlabel("transf retweet_count")
plt.ylabel("frequency")
plt.subplot(2, 2, 3)
plt.hist(np.log1p(train_data['retweet_count']), bins=50, label='Natural Log-Transformed retweet_count')  # bins=round(np.sqrt(train_data.shape[0]))
plt.title("Distrib of Log(1+retweet_count)", fontsize=10, weight='bold')
plt.xlabel("transf retweet_count")
plt.ylabel("frequency")
plt.subplot(2, 2, 4)
plt.hist(np.log1p(np.log1p(train_data['retweet_count'])), bins=50, label='Natural Log-Transformed retweet_count')  # bins=round(np.sqrt(train_data.shape[0]))
plt.title("Distrib of Log(1+Log(1+retweet_count))", fontsize=10, weight='bold')
plt.xlabel("transf retweet_count")
plt.ylabel("frequency")
plt.tight_layout()
plt.show()



target_transformed = np.log1p(train_data['retweet_count'])
target_transformed = (target_transformed - np.mean(target_transformed))/float(np.std(target_transformed))

plt.figure()
plt.hist(target_transformed, bins=50, label='Log')  # bins=round(np.sqrt(train_data.shape[0]))
plt.title("Distribution of retweet_count", fontsize=15, weight='bold')
plt.legend()
plt.xlabel("Popularity")
plt.ylabel("Frequency")
plt.show()


target_transformed = -1/(train_data['retweet_count']+1)+1

plt.figure()
plt.hist(target_transformed, bins=50, label='Log')  # bins=round(np.sqrt(train_data.shape[0]))
plt.title("Distribution of retweet_count", fontsize=15, weight='bold')
plt.legend()
plt.xlabel("Popularity")
plt.ylabel("Frequency")
plt.show()



sys.exit()

"""



# # # # TAKE A SMALLER SUBSET OF THE ENTIRE DATASET (to speed up) # # # #

# train_data = train_data[["retweet_count", "user_verified", "user_statuses_count", "user_followers_count", "user_friends_count", "text", "timestamp_transf_hour", "timestamp_transf_weekday"]]
# train_data = train_data[["retweet_count", "text", "timestamp_transf_hour", "timestamp_transf_weekday", "hashtags_transf", "hashtags_count"]]

# train_data = train_data.head(int(len(train_data)/10))



#######################################
# # # # TRAINING SET - TEST SET # # # #
#######################################



# The labelled data is split into a training and a testing set, so that models can be
# evaluated locally without uploading to Kaggle and without overfitting the evaluation dataset.
# scsplit is used to split the regression data in a stratified way, keeping a similar
# distribution of retweet counts in the two sets (70% train / 30% test).
X_train, X_test, y_train, y_test = scsplit(
    train_data,
    train_data['retweet_count'],
    stratify=train_data['retweet_count'],
    train_size=0.7,
    test_size=0.3,
)
for banner_line in ("-----------------------------------",
                    "TRAINING DONE",
                    "-----------------------------------"):
    print(banner_line)

print(X_train.head())
print(y_train.head())

# The number of retweets is the value to predict, so it is removed from the features.
X_train = X_train.drop(columns=['retweet_count'])
X_test = X_test.drop(columns=['retweet_count'])


# Columns routed to each branch of the preprocessing transformer.
# Selecting them from train_data first raises immediately if one is missing.
num_attribs = train_data[["user_verified", "timestamp_transf_hour", "timestamp_transf_weekday", "hashtags_count", "user_statuses_count", "user_followers_count", "user_friends_count"]].columns.tolist()
text_attribs = "text"
bin_counting_nominal_cat_attribs = "hashtags_transf"


# One single-step pipeline per branch: scaling for the numeric columns,
# TF-IDF (100 features max) for the tweet text, and token counts
# (20 features max) for the comma-joined hashtags.
num_pipe = Pipeline(steps=[('std_scaler', StandardScaler())])
text_pipe = Pipeline(steps=[('tfidf_vect', TfidfVectorizer(stop_words='english', max_features=100))])
bin_counting_nominal_cat_pipe = Pipeline(steps=[('count_vect', CountVectorizer(max_features=20))])

full_pipe = ColumnTransformer(transformers=[
    ('num', num_pipe, num_attribs),
    ('text', text_pipe, text_attribs),
    ('bin_counting', bin_counting_nominal_cat_pipe, bin_counting_nominal_cat_attribs),
])

# Fit on the training features only, then apply the fitted transformer to the test features.
X_train = full_pipe.fit_transform(X_train)
X_test = full_pipe.transform(X_test)

#joblib.dump(X_train, "X_train.pkl")
#joblib.dump(X_test, "X_test.pkl")

#joblib.dump(y_train, "y_train.pkl")
#joblib.dump(y_test, "y_test.pkl")

print("SHAPE OF X_train", X_train.shape)
print("type(X_train) = ", type(X_train))
print("-----------------------------------")



###############
# # # # FLAVIEN
###############
# TODO: add the engineered features to the train/test sets, because for now the models are trained only on train[text]

# # # # SELECT AND TRAIN MODELS # # # #
# MAE of every fitted model, appended in the order the models are trained;
# both lists are read by the ranking plot at the end of the script.
train_mae_scores = []
test_mae_scores = []

# The four sections below are placeholders: only the label is printed,
# no model is fitted and nothing is appended to the score lists.

# Linear Regressor
print("Linear Regressor")

# Lasso Regressor
print("Lasso Regressor")

# Ridge Regressor
print("Ridge Regressor")

# Elastic Net Regressor
print("Elastic Net Regressor")

# GradientBoostingRegressor
print("GradientBoostingRegressor")
# The 'mse' criterion was renamed 'squared_error' in scikit-learn 1.0 and the
# old spelling was removed in 1.2, where it raises at fit time.
# Requires scikit-learn >= 1.0.
gb_reg = GradientBoostingRegressor(criterion='squared_error')
gb_reg.fit(X_train, y_train)
y_pred_gb_reg_train = gb_reg.predict(X_train)
y_pred_gb_reg_test = gb_reg.predict(X_test)
# Score on both sets: the train/test gap shows how much the model overfits.
gb_reg_train_mae = mean_absolute_error(y_true=y_train, y_pred=y_pred_gb_reg_train)
gb_reg_test_mae = mean_absolute_error(y_true=y_test, y_pred=y_pred_gb_reg_test)
print("Prediction error:", gb_reg_train_mae, gb_reg_test_mae)
train_mae_scores.append(gb_reg_train_mae)
test_mae_scores.append(gb_reg_test_mae)

# Persist the fitted model for later reuse.
joblib.dump(gb_reg, "gb_reg.pkl")


# Decision Tree Regressor
print("DecisionTreeRegressor")
from sklearn.tree import DecisionTreeRegressor
# Squared error is the default split criterion of DecisionTreeRegressor in
# every scikit-learn release; leaving it implicit avoids the 'mse' spelling,
# which was renamed 'squared_error' in 1.0 and removed in 1.2.
tree_reg = DecisionTreeRegressor()
tree_reg.fit(X_train, y_train)
pred_tree_reg_train = tree_reg.predict(X_train)
pred_tree_reg_test = tree_reg.predict(X_test)
# Score on both sets: the train/test gap shows how much the model overfits.
tree_reg_train_mae = mean_absolute_error(y_true=y_train, y_pred=pred_tree_reg_train)
tree_reg_test_mae = mean_absolute_error(y_true=y_test, y_pred=pred_tree_reg_test)
print("Dec Tree prediction error:", tree_reg_train_mae, tree_reg_test_mae)
train_mae_scores.append(tree_reg_train_mae)
test_mae_scores.append(tree_reg_test_mae)

# Persist the fitted model for later reuse.
joblib.dump(tree_reg, "tree_reg.pkl")


# Random Forest Regressor
print("RandomForestRegressor")
from sklearn.ensemble import RandomForestRegressor
# Squared error is the default split criterion of RandomForestRegressor in
# every scikit-learn release; leaving it implicit avoids the 'mse' spelling,
# which was renamed 'squared_error' in 1.0 and removed in 1.2.
# 500 shallow trees (depth <= 5, at most 16 leaves each), fitted on all CPU cores.
rdf_reg = RandomForestRegressor(n_estimators=500, max_depth=5, max_leaf_nodes=16, n_jobs=-1)
rdf_reg.fit(X_train, y_train)
pred_rdf_reg_train = rdf_reg.predict(X_train)
pred_rdf_reg_test = rdf_reg.predict(X_test)
# Score on both sets: the train/test gap shows how much the model overfits.
rdf_reg_train_mae = mean_absolute_error(y_true=y_train, y_pred=pred_rdf_reg_train)
rdf_reg_test_mae = mean_absolute_error(y_true=y_test, y_pred=pred_rdf_reg_test)
print("Rand For prediction error:", rdf_reg_train_mae, rdf_reg_test_mae)
train_mae_scores.append(rdf_reg_train_mae)
test_mae_scores.append(rdf_reg_test_mae)

# Logistic Regression
# NOTE(review): this estimator is a classifier, so y_train is presumably
# treated as class labels here; it is scored with MAE like the regressors.
print("LogisticRegression")
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression().fit(X_train, y_train)  # fit() returns the estimator itself
pred_log_reg_train = log_reg.predict(X_train)
pred_log_reg_test = log_reg.predict(X_test)
# MAE on the training set and on the held-out test set.
log_reg_train_mae = mean_absolute_error(y_train, pred_log_reg_train)
log_reg_test_mae = mean_absolute_error(y_test, pred_log_reg_test)
print("Log Reg prediction error:", log_reg_train_mae, log_reg_test_mae)
# Record both scores for the final ranking plot.
train_mae_scores.append(log_reg_train_mae)
test_mae_scores.append(log_reg_test_mae)

# Linear SVC
# NOTE(review): this estimator is a classifier, so y_train is presumably
# treated as class labels here; it is scored with MAE like the regressors.
print("LinearSVC")
from sklearn.svm import LinearSVC
linSVC = LinearSVC().fit(X_train, y_train)  # fit() returns the estimator itself
pred_linSVC_train = linSVC.predict(X_train)
pred_linSVC_test = linSVC.predict(X_test)
# MAE on the training set and on the held-out test set.
linSVC_train_mae = mean_absolute_error(y_train, pred_linSVC_train)
linSVC_test_mae = mean_absolute_error(y_test, pred_linSVC_test)
print("Lin SVC prediction error:", linSVC_train_mae, linSVC_test_mae)
# Record both scores for the final ranking plot.
train_mae_scores.append(linSVC_train_mae)
test_mae_scores.append(linSVC_test_mae)


# # # # RANKING OF MODELS # # # #

# Horizontal grouped bar chart comparing the train and test MAE of each model.
# The labels must be listed in the same order in which the scores were
# appended to train_mae_scores / test_mae_scores.
estimators = ['GradientBoostingRegressor',
              'DecisionTreeRegressor',
              'RandomForestRegressor',
              'LogisticRegression',
              'LinearSVC']

bar_width = 0.10

fig, ax = plt.subplots()
index = np.arange(len(estimators))
# Test bars are shifted by one bar width so that each pair sits side by side.
training_scores1 = ax.barh(index, train_mae_scores, height=bar_width,
                           color='darkred', alpha=0.6, label='Training Scores')
test_scores1 = ax.barh(index + bar_width, test_mae_scores, height=bar_width,
                       color='darkgreen', alpha=0.6, label='Test Scores')
ax.set_title("Ranking of models by MAE scores", fontsize=15, weight='bold')
ax.set_xlabel('MAE')
ax.set_ylabel('Estimators')
# Put each tick halfway between the two bars of a pair.
ax.set_yticks(index + bar_width / 2)
ax.set_yticklabels(estimators)
ax.legend()
fig.tight_layout()
plt.show()