In [None]:
#Importing packages
import numpy as np 
import pandas as pd 
import re
import string
import nltk 
import matplotlib.pyplot as plt
import textblob

from textblob import TextBlob
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, roc_auc_score, classification_report, confusion_matrix, make_scorer, f1_score,accuracy_score, cohen_kappa_score, log_loss
from sklearn.model_selection import GridSearchCV
%matplotlib inline

# IPython display mode: echo the value of every top-level expression statement
# in a cell instead of only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Shared NLP resources used by the text-cleaning function further down.
# NOTE: `stopwords` is rebound here from the imported nltk corpus reader to the
# plain list of English stop words.
ps = nltk.PorterStemmer()
stopwords = nltk.corpus.stopwords.words("english")

In [None]:
#Loading Train and Test Datasets
df_train = pd.read_csv("sentiment_train.csv")
df_test = pd.read_csv("sentiment_test.csv")

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df_train.info()
print(df_train.head())

df_test.info()
print(df_test.head())

In [None]:
#Function to count punctuation percentages
def count_punct(text):
    """Return the share of punctuation among the non-space characters of ``text``.

    Parameters
    ----------
    text : str
        The raw sentence.

    Returns
    -------
    float
        ``round(punctuation_count / non_space_count, 3) * 100``, i.e. a
        percentage. Returns ``0.0`` when ``text`` has no non-space characters
        (empty or all-space input) instead of dividing by zero.
    """
    # Only the literal space character is excluded from the denominator,
    # matching how 'body_len' is computed elsewhere in the notebook.
    non_space = len(text) - text.count(" ")
    if non_space == 0:
        return 0.0
    punct = sum(1 for char in text if char in string.punctuation)
    return round(punct / non_space, 3) * 100

# The three numeric features are derived from the RAW sentence, so they must be
# computed before digits and punctuation are stripped below.
#Calculate Sentence Length (non-space characters)
df_train['body_len'] = df_train['Sentence'].apply(lambda x: len(x) - x.count(" "))
#Calculate Punctuation Percentages
df_train['punct%'] = df_train['Sentence'].apply(count_punct)
#Calculate Sentiment of Sentence (TextBlob polarity score)
df_train['sentiment'] = df_train['Sentence'].apply(lambda Text: TextBlob(Text).sentiment.polarity)

# regex=True is passed explicitly: since pandas 2.0 Series.str.replace treats
# the pattern as a literal string by default, which would turn the two
# substitutions below into silent no-ops. Raw strings avoid invalid-escape
# warnings for \d, \w and \s.
#Replace numbers with space
df_train['Sentence'] = df_train['Sentence'].str.replace(r'\d+', ' ', regex=True)
#Replace every character that is neither a word character nor whitespace with a space
df_train['Sentence'] = df_train['Sentence'].str.replace(r'[^\w\s]', ' ', regex=True)
#Replace multiple whitespace characters with a single space
df_train['Sentence'] = df_train['Sentence'].str.replace(r'\s+', ' ', regex=True)

In [None]:
# Same feature engineering as for the training frame. The numeric features are
# derived from the RAW sentence, so they must be computed before digits and
# punctuation are stripped below.
#Calculate Sentence Length (non-space characters)
df_test['body_len'] = df_test['Sentence'].apply(lambda x: len(x) - x.count(" "))
#Calculate Punctuation Percentages
df_test['punct%'] = df_test['Sentence'].apply(count_punct)
#Calculate Sentiment of Sentence (TextBlob polarity score)
df_test['sentiment'] = df_test['Sentence'].apply(lambda tweet: TextBlob(tweet).sentiment.polarity)

# regex=True is passed explicitly: since pandas 2.0 Series.str.replace treats
# the pattern as a literal string by default, which would turn the two
# substitutions below into silent no-ops. Raw strings avoid invalid-escape
# warnings for \d, \w and \s.
#Replace numbers with space
df_test['Sentence'] = df_test['Sentence'].str.replace(r'\d+', ' ', regex=True)
#Replace every character that is neither a word character nor whitespace with a space
df_test['Sentence'] = df_test['Sentence'].str.replace(r'[^\w\s]', ' ', regex=True)
#Replace multiple whitespace characters with a single space
df_test['Sentence'] = df_test['Sentence'].str.replace(r'\s+', ' ', regex=True)

In [None]:
# Print both frames (train first, then test) to inspect the engineered columns
# and the cleaned 'Sentence' text.
for frame in (df_train, df_test):
    print(frame)

In [None]:
#Function to convert text to lower and stem
def clean_text(text):
    """Tokenise one sentence into lower-cased, stop-word-free, stemmed tokens.

    Parameters
    ----------
    text : str
        A single sentence.

    Returns
    -------
    list of str
        Porter-stemmed tokens. Stop words and empty tokens are dropped.
    """
    # Iterating a str yields characters: drop punctuation and lower-case the rest.
    text = "".join(char.lower() for char in text if char not in string.punctuation)
    # Raw string avoids the invalid-escape warning for \W.
    tokens = re.split(r'\W+', text)
    # re.split produces '' when the text starts or ends with a separator; those
    # empty tokens are skipped so they cannot become a vocabulary term.
    return [ps.stem(word) for word in tokens if word and word not in stopwords]

In [None]:
# Model inputs: the cleaned sentence text plus the three engineered numeric columns.
feature_cols = ['Sentence', 'body_len', 'punct%', 'sentiment']
X_train, X_test = df_train[feature_cols], df_test[feature_cols]

# Target: the 'Polarity' column of each frame.
y_train, y_test = df_train['Polarity'], df_test['Polarity']

In [None]:
#TF-IDF Vectorizer
# clean_text is used as the analyzer, so tokenisation, stop-word removal and
# stemming all happen inside it. The vocabulary is learned from the training
# sentences only and then applied unchanged to the test sentences.
tfidf_vect = TfidfVectorizer(max_features=500, min_df=6, max_df=0.8, analyzer=clean_text)
tfidf_vect_fit = tfidf_vect.fit(X_train['Sentence'])

tfidf_train = tfidf_vect_fit.transform(X_train['Sentence'])
tfidf_test = tfidf_vect_fit.transform(X_test['Sentence'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on old installations.
# Converted to a plain list so the result has the same type as before.
if hasattr(tfidf_vect, "get_feature_names_out"):
    feature_names = list(tfidf_vect.get_feature_names_out())
else:
    feature_names = tfidf_vect.get_feature_names()

#Concatenating TF-IDF features with other features
# reset_index(drop=True) aligns the numeric columns with the freshly built
# TF-IDF frames, which carry a default 0..n-1 index.
X_train_vect = pd.concat([X_train[['body_len', 'punct%','sentiment']].reset_index(drop=True),
           pd.DataFrame(tfidf_train.toarray(), columns=feature_names)], axis=1)
X_test_vect = pd.concat([X_test[['body_len', 'punct%','sentiment']].reset_index(drop=True),
           pd.DataFrame(tfidf_test.toarray(), columns=feature_names)], axis=1)

print(X_train_vect.head())
print(X_test_vect.head())

In [None]:
#Scorer used to rank hyper-parameter candidates: F1, higher is better
scoring_function = make_scorer(f1_score, greater_is_better=True)

#Candidate hyper-parameter values; the search tries every combination
param_grid_rf = {
    'max_depth': [80, 100, 110],
    'max_features': [4, 5, 6],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 300, 500],
}

#Base Random Forest model; the fixed seed keeps runs repeatable
classifier_RF = RandomForestClassifier(random_state=72042)

#Grid search over param_grid_rf with 5-fold cross-validation, scored by F1.
#n_jobs=-1 uses all available cores; training scores are kept alongside the
#validation scores.
grid_search_rf = GridSearchCV(
    estimator=classifier_RF,
    param_grid=param_grid_rf,
    scoring=scoring_function,
    cv=5,
    n_jobs=-1,
    return_train_score=True,
    verbose=5,
)

In [None]:
#Run the grid search on the training features and labels.
#The result is bound to grid_search_RF, the name the later cells use.
grid_search_RF = grid_search_rf.fit(X=X_train_vect, y=y_train)

In [None]:
#Predict labels for the test feature matrix with the fitted grid search
predictions = grid_search_RF.predict(X=X_test_vect)

In [None]:
#Results on Test dataset: confusion matrix, per-class report, accuracy and F1
test_confusion = confusion_matrix(y_test, predictions)
print(test_confusion)

test_report = classification_report(y_test, predictions)
print(test_report)

test_accuracy = accuracy_score(y_test, predictions)
print("Accuracy = {:.2f}".format(test_accuracy))

test_f1 = f1_score(y_test, predictions)
print("F1 Score = {:.2f}".format(test_f1))

In [None]:
# Full list of model input names, in the column order of X_train_vect /
# X_test_vect: the three engineered features followed by the TF-IDF terms.
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out(). list() is required because the newer method returns
# an ndarray, and list + ndarray would not concatenate.
_tfidf_terms = (tfidf_vect.get_feature_names_out()
                if hasattr(tfidf_vect, "get_feature_names_out")
                else tfidf_vect.get_feature_names())
feature_names = ['body_len', 'punct%','sentiment'] + list(_tfidf_terms)

In [None]:
from treeinterpreter import treeinterpreter as ti

# Decompose the best forest's output for every test row. The code below indexes
# the results as: prediction[i, :] per-class output, bias[i] per-class baseline,
# contributions[i][:, c] per-feature contribution towards class column c.
prediction, bias, contributions = ti.predict(grid_search_RF.best_estimator_, X_test_vect)

# Report only the misclassified test rows, each with its ten most influential features.
for i in range(len(X_test_vect)):
    # i is a position, so y_test is read with .iloc; y_test[i] would be a
    # label lookup and breaks if the test frame's index is not 0..n-1.
    truth = y_test.iloc[i]
    if truth == predictions[i]:
        continue
    print("Instance {}".format(i))
    # Expression statements inside a loop body are not auto-displayed by the
    # notebook, so the sentence and the table are printed explicitly.
    print(df_test['Sentence'].iloc[i])
    print("Bias (trainset mean) {}".format(bias[i]))
    print("Truth {}".format(truth))
    print("Prediction {}".format(prediction[i, :]))
    print("Feature contributions:")
    con = pd.DataFrame(data={'feature': feature_names,
                             'value': X_test_vect.iloc[i],
                             'negative contr': contributions[i][:, 0],
                             'positive contr': contributions[i][:, 1],
                             'abs contr': abs(contributions[i][:, 1])})
    con = con.sort_values(by="abs contr", ascending=False)
    # Running total for class column 1: its bias plus the contributions towards
    # that same class, accumulated from the strongest feature down.
    # NOTE(review): the original summed 'negative contr' with bias[i][1], which
    # mixes the two classes; confirm that the positive class was intended.
    con['polarity cumulative'] = con['positive contr'].cumsum() + bias[i][1]
    print(con.head(10))
    print("-"*20)