Notebook produced for the Assignment of the Data Science Lab course at Politecnico di Torino by 

- Francesco Capuano, s295366


# Importations 

In [None]:
import datatable as dt
import pandas as pd
import numpy as np
import os
from tqdm.notebook import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_curve, roc_auc_score, f1_score, make_scorer, plot_confusion_matrix
import pprint

from utils import *
#enabling the progress_apply method used throughout the notebook
tqdm.pandas()

#single seed reused for numpy and for every random_state argument below
rs = 23042000
np.random.seed(rs)

#the data files are addressed through explicit paths instead of temporarily
#changing the working directory: if a read fails, the process is no longer
#left inside the "data" folder (which would break a re-run of this cell)
main_PATH = os.getcwd()
data_PATH = os.path.join(main_PATH, "data")

#importing the data
dev = dt.fread(os.path.join(data_PATH, "development.csv")).to_pandas()
ev = dt.fread(os.path.join(data_PATH, "evaluation.csv")).to_pandas()

#replacing the sentiment (True -> 1, False -> 0); the result is assigned back
#explicitly because an inplace replace on the attribute-accessed column is not
#guaranteed to propagate to the dataframe
dev["sentiment"] = dev["sentiment"].replace([True, False], [1, 0])

In [None]:
#reporting the version of the main third-party packages used in this notebook
import matplotlib
import sklearn

print("***** NEEDED PACKAGES *****")
for label, module in (
    ("datatable", dt),
    ("pandas", pd),
    ("numpy", np),
    ("seaborn", sns),
    ("matplotlib", matplotlib),
    ("scikit-learn", sklearn),
):
    print(f"{label} version:", module.__version__)

# Data Exploration and Data Cleaning

In [None]:
#frequency table of the flag field: "NO_QUERY" is the only value it takes
dev["flag"].value_counts()

In [None]:
#length (in characters) of each tweet
dev["NumberOfChars"] = dev["text"].progress_apply(len)

#1 when the tweet is longer than the 140 chrs limit, 0 otherwise
dev["ExceedsBound"] = dev["NumberOfChars"].progress_apply(lambda n_chars: int(n_chars > 140))

In [None]:
fig, ax = plt.subplots(1, 2, figsize = (6,4))
tick_font = {"fontsize": 12}

#left panel: how many tweets respect / exceed the 140 chrs limit (log scale)
plt.subplot(1,2,1)
ax[0] = sns.histplot(data = dev.ExceedsBound, stat = "count")
ax[0].set_yscale("log")

ax[0].set_xticks([0,1])
ax[0].set_xticklabels(["length consistent", "length exceeding"], fontdict = tick_font, rotation = 10)

#NOTE(review): the y ticks are hard-coded; presumably they are the two group
#sizes observed on this dataset - confirm if the data changes
ax[0].set_yticks([2538, 222456])
ax[0].set_yticklabels(["2538", "222456"],  fontdict = tick_font, rotation = 10)

ax[0].set_xlabel("")
ax[0].set_ylabel("Number of tweets")
ax[0].grid(which = "both")

#right panel: box plot of the tweet length, with the 140 chrs bound highlighted
plt.subplot(1,2,2)
ax[1] = sns.boxplot(data = dev.NumberOfChars)
ax[1].set_xlabel("")

#the secondary axis marks the quartiles and the bound on the right side
length_stats = dev.NumberOfChars.describe()
secax = ax[1].secondary_yaxis("right")
secax.set_yticks([length_stats["25%"], length_stats["50%"], length_stats["75%"], 140])
secax.set_yticklabels(["25%", "50%", "75%", "140 chrs bound"],  fontdict = {"fontsize": 10})

ax[1].axhline(140, color = "red", linestyle = "--", linewidth = 1.25)
ax[1].set_xticks([])

fig.tight_layout()
fig.savefig("NumberOfChars.svg")

With respect to the number of characters, even if the value "140" is statistically valid (as shown in the left part of Figure (NumberOfChars)), it is important to notice that it is not semantically correct, considering the strict bound on the number of characters imposed by Twitter back in 2009. Hence, all the 2538 tweets that exceed this bound should be removed, as they represent data points that are not consistent with the domain of interest.

In [None]:
#index labels of the tweets flagged as longer than the bound, removed in place
tooLongIdx = dev.index[dev["ExceedsBound"] == 1]
dev.drop(index = tooLongIdx, inplace = True)

In [None]:
#per-user statistics: number of positive tweets, total tweets and their ratio
user_sentiment = dev[["user", "sentiment"]]
PosPerUser = user_sentiment.groupby(by = "user").sum()
TotPerUser = user_sentiment.groupby(by = "user").count()

SntPerUser = pd.concat([PosPerUser, TotPerUser], axis = 1)
SntPerUser.columns = ["PositiveTweets", "TotalTweets"]


def _positive_fraction(user_row):
    #fraction of the tweets of one user that carry a positive sentiment
    return user_row["PositiveTweets"]/user_row["TotalTweets"]


def _polarization_bin(pos_ratio):
    #0.75 marks users with less than 10% or more than 90% positive tweets,
    #0.25 every other user (the two values are only used as histogram positions)
    if pos_ratio < 0.10 or pos_ratio > 0.90:
        return 0.75
    return 0.25


SntPerUser["PosRatio"] = SntPerUser.apply(_positive_fraction, axis = 1)
SntPerUser["ExtremeOrNot"] = SntPerUser.PosRatio.progress_apply(_polarization_bin)

In [None]:
fig, ax = plt.subplots(1,2, figsize = (6,4))
title_font = {"fontsize": 10}
label_font = {"fontsize": 12}

#left panel: distribution of the per-user fraction of positive tweets
plt.subplot(1,2,1)
ax[0] = SntPerUser.PosRatio.hist(bins = 50)
ax[0].set_title("Distribution of Positive-Tweetters", fontdict = title_font)
ax[0].set_xlabel("Percentage of Positive Tweets", fontdict = label_font)
ax[0].set_ylabel("Number of users", fontdict = label_font)
ax[0].grid(True)

#right panel: regular users (0.25) against highly polarized users (0.75)
plt.subplot(1,2,2)
ax[1] = SntPerUser.ExtremeOrNot.hist()
ax[1].set_title("Distribution of Highly polarized-Tweetters", fontdict = title_font)
ax[1].set_xlabel("Twitter Users", fontdict = label_font)
ax[1].set_ylabel("Number of users", fontdict = label_font)
ax[1].set_xticks([0.25,0.75])
ax[1].set_xticklabels(["Regular Users", "Highly Polarized"])

fig.tight_layout()
fig.savefig("PosTwet.svg")

In [None]:
#mean, over the users, of the fraction of positive tweets
mean_pos_ratio = SntPerUser.PosRatio.describe()["mean"]
print(f"The mean Positive/Negative ratio is {mean_pos_ratio:.4f}")

The user column should be discarded because classification based on the user goes against the statistics related to the fraction of positive (or negative) tweets each user posts. In particular, the mean fraction of positive tweets that each user posts is 0.5625. This value can be interpreted as an indication that the users are not excessively polarized. Therefore, while Figure (*histogram of positive tweets*) shows that there are users who exclusively produce content associated with positive sentiment, the number of those is not large enough to drive the mean value to one category only.

In [None]:
#parsing the date column of an (ids, date) working table through the
#transformDate helper, then ordering the rows by tweet id
format_string = '%d %b %Y %H:%M:%S'
date_df = dev.loc[:, ["ids", "date"]]
date_df["date"] = date_df["date"].progress_apply(lambda raw_date: transformDate(raw_date, format_string))
date_df = date_df.sort_values(by = "ids")

In [None]:
#line plot of the tweet id against the posting date (one point every 50 rows)
ids = date_df.ids.values
dates = date_df.date.values
fig,ax = plt.subplots(figsize = (6,4))

#two reference rows of the id-sorted table are used as tick positions
tick_rows = (10000, 200000)
xtick = [date_df.iloc[row, 1] for row in tick_rows]
ytick = [date_df.iloc[row, 0] for row in tick_rows]

ax = sns.lineplot(x = dates[::50], y = ids[::50])

ax.set_xticks(xtick)
ax.set_xticklabels(xtick, rotation = 15)
ax.set_yticks(ytick)
ax.set_yticklabels(ytick, rotation = 15)
ax.grid()

fig.tight_layout()
fig.savefig("IdsDate.svg")

# Text Cleaning

In [None]:
#cleaning every tweet with the text_cleaning helper
dev["CleanedText"] = dev.text.progress_apply(text_cleaning)

#analyzing the effects of the text_cleaning: rows left with no text at all
is_empty = dev.CleanedText.progress_apply(len) == 0
emptyIdx = dev.index[is_empty]

#discarding the empty text cells
dev.drop(index = emptyIdx, inplace = True)

#discarding the rows with a duplicated cleaned text (the first occurrence is kept);
#the de-duplication is applied to the dataframe itself: calling drop_duplicates
#on the CleanedText column alone does not remove any row from dev
dev.drop_duplicates(subset = "CleanedText", inplace = True)

#class balance after the cleaning
dev.sentiment.hist()

Considering the class distribution shown above, we have decided to undersample the Positive Tweets group so as to obtain a balanced dataset.

In [None]:
#balancing the classes: as many positive tweets as negative ones are kept,
#drawn without replacement with the notebook seed
negDF = dev[dev.sentiment == 0]
all_positive = dev[dev.sentiment == 1]
posDF = all_positive.sample(n = len(negDF), random_state = rs, replace = False, axis = 0)

dev = pd.concat([posDF, negDF], axis = 0)

# Feature Extraction

In this section we experiment with different vectorizers in order to select the best one.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [None]:
#hyperparameters shared by every Logistic Regression fitted in this cell
LR_params = {"solver": "newton-cg",
             "max_iter": 500,
             "n_jobs": -1,
             "random_state": rs,
             "verbose": 1}

#vocabulary sizes under test: 10k, 20k, ..., 300k
MAX_F = np.arange(10000, 310000, 10000)

y = dev.sentiment.values

#n-gram ranges under test; note that "bigrams" and "trigrams" also include
#the shorter n-grams
_NGRAM_RANGES = {
    "unigrams": (1,1),
    "bigrams": (1,2),
    "trigrams": (1,3)
}


def _holdout_macro_f1(vectorizer, texts, labels):
    """Return the macro-averaged F1 of a Logistic Regression on a 5% hold-out.

    The vectorizer is fitted on all the texts, then the resulting matrix is
    split (95% train / 5% test, seeded with rs) and a Logistic Regression
    built from LR_params is trained and scored on it.
    """
    X = vectorizer.fit_transform(texts)
    X_train, X_test, y_train, y_test = train_test_split(X, labels, random_state = rs, test_size = 0.05)

    model = LogisticRegression(**LR_params)
    model.fit(X_train, y_train)
    return f1_score(y_test, model.predict(X_test), average = "macro")


def _sweep_max_features(texts, labels, stop_words):
    """Score Count and TfIdf vectorizers for every n-gram range and every size in MAX_F.

    stop_words is forwarded as is to the vectorizers ("english" or None).
    Returns (tfidf_scores, cvec_scores): two dictionaries keyed by
    "unigrams"/"bigrams"/"trigrams", each holding one F1 score per value of MAX_F.
    """
    tfidf_scores = {name: [] for name in _NGRAM_RANGES}
    cvec_scores = {name: [] for name in _NGRAM_RANGES}
    families = (
        ("CVec", CountVectorizer, cvec_scores),
        ("tVec", TfidfVectorizer, tfidf_scores)
    )

    for f in tqdm(MAX_F):
        for tag, vectorizer_class, scores in families:
            for order, (name, ngram_range) in enumerate(_NGRAM_RANGES.items(), start = 1):
                vectorizer = vectorizer_class(ngram_range = ngram_range,
                                              stop_words = stop_words,
                                              max_features = int(f))
                #each matrix is built, scored and released before the next one,
                #so that a single matrix is kept in memory at a time
                scores[name].append(_holdout_macro_f1(vectorizer, texts, labels))
                print("{}_{}:ok".format(tag, order))

    return tfidf_scores, cvec_scores


#removing the stopwords
tfidf_SW_rem, cvec_SW_rem = _sweep_max_features(dev.CleanedText, y, stop_words = "english")

#non removing the stopwords
tfidf_with_SW, cvec_with_SW = _sweep_max_features(dev.CleanedText, y, stop_words = None)

Here we present the visualizations needed to convey the results obtained in the previous cell.

In [None]:
#visualization
#Line2D is imported explicitly: the legend handles below need it and no other
#explicit import in this notebook provides it
from matplotlib.lines import Line2D

#one color per n-gram range, shared by the TfIdf and the CountVectorizer curves
_NGRAM_COLORS = (("unigrams", "green"), ("bigrams", "red"), ("trigrams", "blue"))


def _plot_f1_panel(axis, title, tfidf_scores, cvec_scores):
    """Draw on axis the F1 score against the number of features for one stopword setting.

    TfIdf curves are dashed, CountVectorizer curves are solid. Two legends are
    placed to the right of the panel: one for the colors, one for the line styles.
    """
    axis.set_title(title)

    for name, color in _NGRAM_COLORS:
        axis.plot(MAX_F, tfidf_scores[name], label = name, linestyle = "--", c = color)
    for name, color in _NGRAM_COLORS:
        axis.plot(MAX_F, cvec_scores[name], label = name, c = color)

    axis.set_xlabel("Number of features")
    axis.set_ylabel("MacroAvg F1 score")

    #legend for the colors (n-gram range)
    custom_lines = [Line2D([0], [0], color = color, lw = 2) for _, color in _NGRAM_COLORS]
    color_legend = axis.legend(custom_lines, ["unigrams", "bigrams", "trigram"], loc = (1.01,0.1))

    #legend for the line styles (vectorizer); a second call to legend replaces
    #the first one, which is therefore added back as an artist
    custom_styles = [Line2D([0], [0], color = "grey", ls = "--"),
                     Line2D([0], [0], color = "grey", ls = "-")]

    axis.grid()
    axis.legend(custom_styles, ["TfIdf", "CVec"], loc = (1.04, 0.45))
    axis.add_artist(color_legend)


fig, ax = plt.subplots(2, 1, figsize = (8,6))

#removing stopwords
_plot_f1_panel(ax[0], "MacroAvg F1 score removing stopwords", tfidf_SW_rem, cvec_SW_rem)

#non removing stopwords
_plot_f1_panel(ax[1], "MacroAvg F1 score without removing stopwords", tfidf_with_SW, cvec_with_SW)

fig.tight_layout()
fig.savefig("VecResults.svg")

# Model

Considering the scant improvement consequent to: 

1) **increasing the value of max_features from 200k to 300k**

2) **adopting trigrams for the vectorization**

We therefore set max_features to 200k and used bigrams. This improved the efficiency of the computation. 
We tested out three models: 

1) *Logistic Regression*

2) *Multinomial Naive Bayes Classifier*

3) *Linear SVC*

In this section we present the results of these experiments having fixed the vectorization as above mentioned. 

In [None]:
#TfIdf over unigrams and bigrams, 200k features, stopwords kept
y = dev.sentiment.values

vec_2 = TfidfVectorizer(
    ngram_range = (1,2),
    max_features = 200000,
    stop_words = None
)
Xgrid = vec_2.fit_transform(dev.CleanedText)
print("Vectorization completed!")

In [None]:
#95% / 5% split shared by the three candidate models
Xtrain, X_test, ytrain, y_test = train_test_split(Xgrid, y, test_size = 0.05, random_state = rs)

#logistic regression: fit, score of the positive class and AUC
LR = LogisticRegression(n_jobs = -1, random_state = rs)
LR.fit(Xtrain, ytrain)
LRy_pred_proba = LR.predict_proba(X_test)[:, 1]
LRauc = roc_auc_score(y_test, LRy_pred_proba)

#svc
#NOTE(review): _predict_proba_lr is a private scikit-learn method - confirm it
#is still available when upgrading the library
SVC = LinearSVC(random_state = rs, verbose = 1)
SVC.fit(Xtrain, ytrain)
SVCy_pred_proba = SVC._predict_proba_lr(X_test)[:, 1]
SVCauc = roc_auc_score(y_test, SVCy_pred_proba)

#multinomial
M_NB = MultinomialNB()
M_NB.fit(Xtrain, ytrain)
MNBy_pred_proba = M_NB.predict_proba(X_test)[:, 1]
MNBauc = roc_auc_score(y_test, MNBy_pred_proba)

In [None]:
#the F1 score is macro-averaged, consistently with the metric used for the
#vectorizer selection, for the grid search and for the final evaluation

#logistic regression
LR_f1 = f1_score(y_test, LR.predict(X_test), average = "macro")

#svc
SVC_f1 = f1_score(y_test, SVC.predict(X_test), average = "macro")

#multinomial
MNB_f1 = f1_score(y_test, M_NB.predict(X_test), average = "macro")

In [None]:
#one row per model: name | AUC | F1 score
comparison_rows = (
    ("Logistic Regression", LRauc, LR_f1),
    ("Linear SVC         ", SVCauc, SVC_f1),
    ("MN Bayes           ", MNBauc, MNB_f1),
)
for model_name, model_auc, model_f1 in comparison_rows:
    print(model_name, "|", model_auc, "|", model_f1)

Considering these results, we have chosen to further explore the Logistic Regression classifier in the hyperparameter tuning phase: besides the small margin (0.01) by which it outperforms the others, Logistic Regression is fully interpretable. 

# Grid search

Here we perform a Grid Search with cross validation in order to obtain the best possible set of hyperparameters.

In [None]:
#the macro-averaged F1 score is the selection criterion of the search
f1_scorer = make_scorer(f1_score, average = "macro")

#only C is actually searched (100 values between 0.01 and 5); the other
#entries are single-valued, i.e. they are fixed settings of the estimator
params = dict(
    verbose = [1],
    n_jobs = [-1],
    random_state = [rs],
    C = np.linspace(0.01, 5, 100)
)

#8-fold cross validation; with refit the best configuration is trained again
#on the whole Xgrid
clf = GridSearchCV(estimator = LogisticRegression(),
                   param_grid = params,
                   scoring = f1_scorer,
                   cv = 8,
                   refit = True,
                   verbose = 10)
clf.fit(Xgrid, y)

bestLR = clf.best_estimator_

In [None]:
#full parameter set of the estimator selected by the grid search
bestP = bestLR.get_params()

print("Best parameters are: \n********************")
pprint.pprint(bestP)

# Model evaluation 

Here we use different metrics to evaluate the performance of the best classifier we have found in the hyperparameter tuning. 

In [None]:
#at first we instiate a new classifier (with the best parameters) and we train it on the 95% of the data
roc_LR = LogisticRegression(**bestP)
Xtrain, X_test, ytrain, y_test = train_test_split(Xgrid, y, test_size = 0.05, random_state = rs)
roc_LR.fit(Xtrain, ytrain)

#we then obtain the roc curve
y_pred_proba = roc_LR.predict_proba(X_test)[::,1]
fpr, tpr, _ = roc_curve(y_test,  y_pred_proba)
auc = roc_auc_score(y_test, y_pred_proba)

In [None]:
fig, ax = plt.subplots(figsize = (6,4))
axis_font = {"fontsize": 13}

#roc curve of the classifier (red) against the diagonal reference (dashed black)
axx = plt.plot(fpr, tpr, label = "AUC = {:.2f}".format(auc), color = "r")
ax.plot([0,1], [0,1], "k--")

ax.set_title("ROC Curve")
ax.set_xlabel('False Positive Rate', fontdict = axis_font)
ax.set_ylabel('True Positive Rate', fontdict = axis_font)
ax.legend(loc = "lower right")

fig.savefig("RocCurve.svg")

In [None]:
#here we present the confusion matrix for our classifier
fig, ax = plt.subplots()

plot_confusion_matrix(roc_LR, X_test, y_test, ax = ax,
                        display_labels = ["Positive", "Negative"], cmap = "magma")  
fig.savefig("ConfMatrix.svg")

In [None]:
#macro-averaged F1 score of the tuned classifier on the 5% hold-out
final_f1 = f1_score(y_test, roc_LR.predict(X_test), average = "macro")
print(f"F1 Score reached by the model is {final_f1:.4f}")

# Submission

For the submission (i.e., the prediction on the evaluation set) we use a classifier which is fitted on the whole Xgrid, i.e. the classifier returned by the GridSearch. 

In [None]:
#reading the evaluation data and cleaning its text as done for the development set
ev = dt.fread("./data/evaluation.csv").to_pandas()
ev["CleanedText"] = ev.text.progress_apply(text_cleaning)

#the vectorizer fitted during training is only applied here (no new fit),
#then the classifier returned by the grid search predicts the labels
Xev = vec_2.transform(ev.CleanedText)
yev = bestLR.predict(Xev)

#csv for the submission: one row per evaluation tweet, indexed by "Id"
subs = pd.DataFrame({"Predicted": yev}, index = ev.index)
subs.index.name = "Id"
subs.to_csv("FinalSubmission.csv")

# Explanation

Here we present the top words that influence the sentiment attribution

In [None]:
#(coefficient, feature) couples of the tuned classifier, ordered by coefficient:
#ascending for the negative side, descending for the positive side
coef_feature_couples = list(zip(bestLR.coef_.reshape(-1,1), vec_2.get_feature_names()))
neg_couples = sorted(coef_feature_couples)
pos_couples = sorted(coef_feature_couples, reverse = True)

#number of words reported for each sentiment
N = 5

In [None]:
#table with the N features having the largest and the smallest coefficients
words = pd.DataFrame({
    "Positive Words": [feature for _, feature in pos_couples[:N]],
    "Negative Words": [feature for _, feature in neg_couples[:N]],
})
print(words)