In [32]:
#importing neccesary packages
import pandas as pd
import numpy as np
import nltk
import spacy
import matplotlib.pyplot as plt
import re
import string
import warnings
from IPython.core.display import display
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost
from xgboost import XGBClassifier, DMatrix, cv
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('train1.csv')

In [3]:
df.head()

Unnamed: 0,text,sentiment
0,"oh Marly, I`m so sorry!! I hope you find her...",neutral
1,Playing Ghost Online is really interesting. Th...,positive
2,is cleaning the house for her family who is co...,neutral
3,gotta restart my computer .. I thought Win7 wa...,neutral
4,SEe waT I Mean bOuT FoLL0w fRiiDaYs... It`S cA...,neutral


In [4]:
#Checking for null values 
df.isnull().sum()

text         1
sentiment    0
dtype: int64

In [5]:
#droping null values 
df.dropna(inplace=True)

In [6]:
#Checking if there are any null strings and removing them
blanks = [] 

for i,txt,lb in df.itertuples():  # iterate over the DataFrame
    if type(txt)==str:            # avoid NaN values
        if txt.isspace():         # test 'text' for whitespace
            blanks.append(i)     # add matching index numbers to the list

df.drop(blanks, inplace=True)

In [7]:
#Checking the count of each sentiment 
df['sentiment'].value_counts()

neutral     11105
positive     8575
negative     7767
Name: sentiment, dtype: int64

#### To start with ,we will be using NLTK's vader sentiment analyzer and check it's accuracy

In [8]:
sid = SentimentIntensityAnalyzer()

In [9]:
#For converting score to sentiment
def sentiment(score):
    '''
    score: compund score from vader sentiment intensity analyzer
    '''
    if score>0:
        return 'positive'
    elif score<0:
        return 'negative'
    else:
        return 'netural'

In [10]:
# getting the polarity scores for the tweets
df['scores'] = df['text'].apply(lambda text: sid.polarity_scores(text))
#getting the compund score for the tweets
df['compound']  = df['scores'].apply(lambda score_dict: score_dict['compound'])
#converting compund score to sentiment
df['comp_score'] = df['compound'].apply(sentiment)

df.head()

Unnamed: 0,text,sentiment,scores,compound,comp_score
0,"oh Marly, I`m so sorry!! I hope you find her...",neutral,"{'neg': 0.088, 'neu': 0.402, 'pos': 0.51, 'com...",0.8557,positive
1,Playing Ghost Online is really interesting. Th...,positive,"{'neg': 0.085, 'neu': 0.738, 'pos': 0.177, 'co...",0.3597,positive
2,is cleaning the house for her family who is co...,neutral,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,netural
3,gotta restart my computer .. I thought Win7 wa...,neutral,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,netural
4,SEe waT I Mean bOuT FoLL0w fRiiDaYs... It`S cA...,neutral,"{'neg': 0.333, 'neu': 0.667, 'pos': 0.0, 'comp...",-0.6124,negative


In [11]:
print("Accuracy of vader sentiment analyzer")
accuracy_score(df['sentiment'],df['comp_score'])


Accuracy of vader sentiment analyzer


0.4427077640543593

In [12]:
#removing vader's predictions 
df.drop(['scores','compound','comp_score'],inplace=True,axis=1)

#### Cleaning the tweets and using other models

In [13]:
#function to clean the tweets
nlp = spacy.load("en")
sp = spacy.load('en_core_web_md')
all_stopwords = sp.Defaults.stop_words
def process_tweets(text):    
    text = str(text).lower() #lower
    text = re.sub('\[.*?\]', '', text) #Remove text in square brackets
    text = re.sub('https?://\S+|www\.\S+', '', text) #Hyperlinks removal
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text) #punctuations
    text = re.sub('\n', '', text) #newlines
    text = re.sub('\w*\d\w*', '', text) #word containing numbers
    tokens = word_tokenize(text) #tokenizing the tweet
    filtered_sentence = " ".join([w for w in tokens if not w in all_stopwords]) #removing stopwords
    clean_text = " ".join([w.lemma_ for w in nlp(filtered_sentence)]) #Lemmatization of words
    return clean_text

In [14]:
df['clean_text'] = df['text'].apply(lambda x:process_tweets(x))

In [15]:
accuracy_list = []
model_list = []
def output_metrics(predictions):
    ''' 
    Predictions are the predicted values for the test data 
    '''
    print("-------Accuracy Score--------")
    print(accuracy_score(y_test, predictions))
    accuracy_list.append(accuracy_score(y_test, predictions))
    print('\n')
    print("-------Classification Report--------")
    print(classification_report(y_test,predictions))
    print("-------Confusion Matrix--------")
    conf_mat = confusion_matrix(y_test,predictions)
    conf_mat_df = pd.DataFrame(data=conf_mat,columns=['negative','neutral','positive'],index=['negative','neutral','positive'])
    display(conf_mat_df)
    
    
def train_model(model,X_train,y_train,X_test,y_test):
    '''
    model            :  Model which is going to be used 
    X_train ,y_train :  features and labels of training data
    X_test, y_test   :  features and labels of testing data
    '''
    model_list.append(model)
    cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
    tweet_clf = Pipeline([('tfidf', TfidfVectorizer()),
                         ('model', model)])                    #doing Tf-idf on the data 
    scores = cross_val_score(tweet_clf, X_train, y_train, cv=cv)
    print("Cross Validation scores")
    print(scores)
    tweet_clf.fit(X_train,y_train)
    predictions = tweet_clf.predict(X_test)
    output_metrics(predictions)
    

In [16]:
#Splitting the data into testing and training set
X = df['clean_text']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=30)

### Models

#### SVM

In [17]:
svm = SVC()
train_model(svm, X_train, y_train, X_test, y_test)

Cross Validation scores
[0.41082582 0.39746704 0.41498959 0.40857044 0.41377516]
-------Accuracy Score--------
0.39526411657559196


-------Classification Report--------
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00      2307
     neutral       0.40      1.00      0.57      3255
    positive       0.00      0.00      0.00      2673

   micro avg       0.40      0.40      0.40      8235
   macro avg       0.13      0.33      0.19      8235
weighted avg       0.16      0.40      0.22      8235

-------Confusion Matrix--------


Unnamed: 0,negative,neutral,positive
negative,0,2307,0
neutral,0,3255,0
positive,0,2673,0


#### Decision Tree

In [18]:
dt =  DecisionTreeClassifier()
train_model(dt, X_train, y_train, X_test, y_test)

Cross Validation scores
[0.64555864 0.64503817 0.63202637 0.64104788 0.64208883]
-------Accuracy Score--------
0.6563448694596236


-------Classification Report--------
              precision    recall  f1-score   support

    negative       0.62      0.60      0.61      2307
     neutral       0.63      0.65      0.64      3255
    positive       0.72      0.72      0.72      2673

   micro avg       0.66      0.66      0.66      8235
   macro avg       0.66      0.65      0.66      8235
weighted avg       0.66      0.66      0.66      8235

-------Confusion Matrix--------


Unnamed: 0,negative,neutral,positive
negative,1388,691,228
neutral,629,2103,523
positive,222,537,1914


#### Random Forest

In [19]:
rf =  RandomForestClassifier()
train_model(rf, X_train, y_train, X_test, y_test)

Cross Validation scores
[0.65423317 0.65943789 0.66221374 0.66134629 0.65787647]
-------Accuracy Score--------
0.6633879781420765


-------Classification Report--------
              precision    recall  f1-score   support

    negative       0.65      0.59      0.62      2307
     neutral       0.61      0.71      0.66      3255
    positive       0.75      0.66      0.71      2673

   micro avg       0.66      0.66      0.66      8235
   macro avg       0.67      0.66      0.66      8235
weighted avg       0.67      0.66      0.66      8235

-------Confusion Matrix--------


Unnamed: 0,negative,neutral,positive
negative,1372,761,174
neutral,537,2315,403
positive,190,707,1776


#### Multinomial Naive bayes

In [20]:
mnb = MultinomialNB()
train_model(mnb, X_train, y_train, X_test, y_test)

Cross Validation scores
[0.60843164 0.58657183 0.62092297 0.60704372 0.60877863]
-------Accuracy Score--------
0.6195506982392228


-------Classification Report--------
              precision    recall  f1-score   support

    negative       0.77      0.42      0.55      2307
     neutral       0.53      0.83      0.64      3255
    positive       0.78      0.53      0.63      2673

   micro avg       0.62      0.62      0.62      8235
   macro avg       0.69      0.60      0.61      8235
weighted avg       0.67      0.62      0.61      8235

-------Confusion Matrix--------


Unnamed: 0,negative,neutral,positive
negative,978,1236,93
neutral,237,2697,321
positive,61,1185,1427


#### KNN

In [23]:
knn =  KNeighborsClassifier()
train_model(knn, X_train, y_train, X_test, y_test)

Cross Validation scores
[0.46582235 0.4611381  0.47866065 0.46877169 0.48299792]
-------Accuracy Score--------
0.46885245901639344


-------Classification Report--------
              precision    recall  f1-score   support

    negative       0.56      0.18      0.27      2307
     neutral       0.43      0.90      0.58      3255
    positive       0.74      0.20      0.32      2673

   micro avg       0.47      0.47      0.47      8235
   macro avg       0.58      0.42      0.39      8235
weighted avg       0.57      0.47      0.41      8235

-------Confusion Matrix--------


Unnamed: 0,negative,neutral,positive
negative,408,1844,55
neutral,207,2914,134
positive,108,2026,539


#### SGD

In [24]:
sgd = SGDClassifier()
train_model(sgd, X_train, y_train, X_test, y_test)

Cross Validation scores
[0.68355309 0.67713393 0.67574601 0.67609299 0.68771686]
-------Accuracy Score--------
0.6925318761384335


-------Classification Report--------
              precision    recall  f1-score   support

    negative       0.73      0.57      0.64      2307
     neutral       0.62      0.75      0.68      3255
    positive       0.78      0.72      0.75      2673

   micro avg       0.69      0.69      0.69      8235
   macro avg       0.71      0.68      0.69      8235
weighted avg       0.70      0.69      0.69      8235

-------Confusion Matrix--------


Unnamed: 0,negative,neutral,positive
negative,1324,833,150
neutral,404,2455,396
positive,86,663,1924


#### XGBoost

In [25]:
XGB = XGBClassifier()
train_model(XGB, X_train, y_train, X_test, y_test)



Cross Validation scores
[0.68494101 0.67418459 0.67834837 0.68025677 0.68442054]
-------Accuracy Score--------
0.6907103825136612


-------Classification Report--------
              precision    recall  f1-score   support

    negative       0.75      0.53      0.62      2307
     neutral       0.61      0.79      0.69      3255
    positive       0.79      0.71      0.75      2673

   micro avg       0.69      0.69      0.69      8235
   macro avg       0.72      0.68      0.69      8235
weighted avg       0.71      0.69      0.69      8235

-------Confusion Matrix--------


Unnamed: 0,negative,neutral,positive
negative,1218,948,141
neutral,323,2579,353
positive,85,697,1891


#### Hyper parameter tuning for XGBoost

In [26]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

In [27]:
def change_to_number(sentiment):
    if sentiment=='neutral':
        return 0
    elif sentiment=='positive':
        return 1
    else:
        return 2

In [28]:
y = df['sentiment']
y = y.apply(change_to_number)

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=30)

In [30]:
#citation : https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
def modelfit(alg, X_train, y_train, X_test, y_test, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgb_param['num_class'] = 3
        xgtrain = DMatrix(X_train, label=y_train)
        cvresult = cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='merror', early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    #Fit the algorithm on the data
    alg.fit(X_train, y_train, eval_metric='auc')
        
    #Predict training set:
    dtrain_predictions = alg.predict(X_test)
#     dtrain_predprob = alg.predict_proba(X_test)[:,1]
        
    #Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % accuracy_score(y_test, dtrain_predictions))

                    


In [33]:
xgb1 = XGBClassifier(
 learning_rate =0.2,
 n_estimators=1000,
 max_depth=5,
 
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'multi:softmax',
 
 seed=27)
modelfit(xgb1,X_train, y_train, X_test, y_test)


Model Report
Accuracy : 0.6942


In [34]:
from sklearn.model_selection import GridSearchCV
param_test1 = {
 'max_depth':[4,5,6],
 'min_child_weight':[4,5,6]
}
gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate=0.2, n_estimators=200, max_depth=5,
 min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'multi:softmax', scale_pos_weight=1,seed=27), 
 param_grid = param_test1, iid=False, cv=5)
gsearch1.fit(X_train,y_train)
gsearch1.best_params_, gsearch1.best_score_
max_depth_opt = gsearch1.best_params_['max_depth']
min_child_weight_opt = gsearch1.best_params_['min_child_weight']

Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in lang

Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in lang

Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in lang

In [35]:
#Optimal parameters and the best accuracy score
gsearch1.best_params_, gsearch1.best_score_

({'max_depth': 6, 'min_child_weight': 4}, 0.6793674227754838)

In [36]:
param_test2 = {
 'gamma':[i/10.0 for i in range(0,5)]
}
gsearch2 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.2, n_estimators=200, max_depth=4,
 min_child_weight=6, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'multi:softmax', scale_pos_weight=1,seed=27), 
 param_grid = param_test2, n_jobs=4,iid=False, cv=5)
gsearch2.fit(X_train,y_train)
gsearch2.best_params_, gsearch2.best_score_
gamma_opt =gsearch2.best_params_['gamma'] 

Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [37]:
gsearch2.best_params_, gsearch2.best_score_

({'gamma': 0.3}, 0.6721321363788999)

In [38]:
param_test3 = {
'subsample':[i/10.0 for i in range(6,10)],
 'colsample_bytree':[i/10.0 for i in range(6,10)]
}
gsearch3 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.2, n_estimators=150, max_depth=6,
 min_child_weight=4, gamma=0.4, subsample=0.8, colsample_bytree=0.8,
 objective= 'multi:softmax',  scale_pos_weight=1,seed=27), 
 param_grid = param_test3, iid=False, cv=5)
gsearch3.fit(X_train,y_train)
gsearch3.best_params_, gsearch3.best_score_
subsample_opt = gsearch3.best_params_['subsample']
colsample_bytree_opt = gsearch3.best_params_['colsample_bytree']

Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in lang

Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in lang

Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in lang

Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in lang

In [39]:
gsearch3.best_params_, gsearch3.best_score_

({'colsample_bytree': 0.7, 'subsample': 0.9}, 0.6793674905041083)

In [None]:
#Run this code to get the optimal parameters for XGBoost model 

# param_test6 = {
#      'max_depth':[4,5,6],
#      'min_child_weight':[4,5,6],
#      'gamma':[i/10.0 for i in range(0,5)],
#      'subsample':[i/10.0 for i in range(6,10)],
#      'colsample_bytree':[i/10.0 for i in range(6,10)],
#      'reg_alpha':[0, 0.001, 0.005, 0.01, 0.05]
# }
# gsearch6 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.2, n_estimators=150, max_depth=6,
#  min_child_weight=4, gamma=0.4, subsample=0.8, colsample_bytree=0.8,
#  objective= 'multi:softmax',  scale_pos_weight=1,seed=27), 
#  param_grid = param_test6, iid=False, cv=5)
# gsearch6.fit(X_train,y_train)
# gsearch6.best_params_, gsearch6.best_score_

In [40]:
#Trying with learning rate =0.2
xgb_tuned = XGBClassifier(
 learning_rate =0.2,
 n_estimators=1000,
 max_depth=max_depth_opt,
 min_child_weight=min_child_weight_opt,
 
 gamma=gamma_opt,
 subsample=subsample_opt,
 colsample_bytree=colsample_bytree_opt,
 objective= 'multi:softmax',
 
 seed=27)
modelfit(xgb_tuned,X_train, y_train, X_test, y_test)


Model Report
Accuracy : 0.6948


In [43]:
#Trying with learning rate =0.1
xgb_tuned2 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=max_depth_opt,
 min_child_weight=min_child_weight_opt,
 
 gamma=gamma_opt,
 subsample=subsample_opt,
 colsample_bytree=colsample_bytree_opt,
 objective= 'multi:softmax',
 
 seed=27)
modelfit(xgb_tuned2,X_train, y_train, X_test, y_test)


Model Report
Accuracy : 0.6919


In [48]:
model_list.append(xgb_tuned)
accuracy_list.append(0.6948)
model_accu_list = zip(model_list,accuracy_list)
model_accu_list= list(model_accu_list)
report_df = pd.DataFrame(model_accu_list,columns=["Model","Accuracy"])
report_df = report_df.sort_values(by=['Accuracy'],ascending=False)
report_df

Unnamed: 0,Model,Accuracy
11,"XGBClassifier(base_score=None, booster=None, c...",0.6948
5,"SGDClassifier(alpha=0.0001, average=False, cla...",0.692532
7,"XGBClassifier(base_score=None, booster=None, c...",0.6919
8,"XGBClassifier(base_score=None, booster=None, c...",0.6919
9,"XGBClassifier(base_score=None, booster=None, c...",0.6919
10,"XGBClassifier(base_score=None, booster=None, c...",0.6919
6,"XGBClassifier(base_score=0.5, booster='gbtree'...",0.69071
2,"(DecisionTreeClassifier(class_weight=None, cri...",0.663388
1,"DecisionTreeClassifier(class_weight=None, crit...",0.656345
3,"MultinomialNB(alpha=1.0, class_prior=None, fit...",0.619551


In [49]:
from sklearn.externals import joblib 
  
# Save the model as a pickle in a file 
joblib.dump(xgb_tuned, 'xgb.pkl') 

['xgb.pkl']

In [53]:
#prediction.py for train.csv
import pandas as pd
import numpy as np
import nltk
import spacy
import re
import string
import warnings
from IPython.core.display import display
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier, DMatrix, cv
warnings.filterwarnings('ignore')

df = pd.read_csv('train1.csv')

df.dropna(inplace=True)

blanks = [] 
for i,txt,lb in df.itertuples():  # iterate over the DataFrame
    if type(txt)==str:            # avoid NaN values
        if txt.isspace():         # test 'text' for whitespace
            blanks.append(i)     # add matching index numbers to the list

df.drop(blanks, inplace=True)

nlp = spacy.load("en")
sp = spacy.load('en_core_web_md')
all_stopwords = sp.Defaults.stop_words

def process_tweets(text):    
    text = str(text).lower() #lower
    text = re.sub('\[.*?\]', '', text) #Remove text in square brackets
    text = re.sub('https?://\S+|www\.\S+', '', text) #Hyperlinks removal
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text) #punctuations
    text = re.sub('\n', '', text) #newlines
    text = re.sub('\w*\d\w*', '', text) #word containing numbers
    tokens = word_tokenize(text) #tokenizing the tweet
    filtered_sentence = " ".join([w for w in tokens if not w in all_stopwords]) #removing stopwords
    clean_text = " ".join([w.lemma_ for w in nlp(filtered_sentence)]) #Lemmatization of words
    return clean_text

df['clean_text'] = df['text'].apply(lambda x:process_tweets(x))

X = df['clean_text']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=30)

def output_metrics(predictions):
    ''' 
    Predictions are the predicted values for the test data 
    '''
    print("-------Accuracy Score--------")
    print(accuracy_score(y_test, predictions))
    accuracy_list.append(accuracy_score(y_test, predictions))
    print('\n')
    print("-------Classification Report--------")
    print(classification_report(y_test,predictions))
    print("-------Confusion Matrix--------")
    conf_mat = confusion_matrix(y_test,predictions)
    conf_mat_df = pd.DataFrame(data=conf_mat,columns=['negative','neutral','positive'],index=['negative','neutral','positive'])
    display(conf_mat_df)
    
    
def train_model(model,X_train,y_train,X_test,y_test):
    '''
    model            :  Model which is going to be used 
    X_train ,y_train :  features and labels of training data
    X_test, y_test   :  features and labels of testing data
    '''
    model_list.append(model)
    cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
    tweet_clf = Pipeline([('tfidf', TfidfVectorizer()),
                         ('model', model)])                    #doing Tf-idf on the data 
    scores = cross_val_score(tweet_clf, X_train, y_train, cv=cv)
    print("Cross Validation scores")
    print(scores)
    tweet_clf.fit(X_train,y_train)
    predictions = tweet_clf.predict(X_test)
    output_metrics(predictions)

xgb = joblib.load('xgb.pkl') 
train_model(xgb, X_train, y_train, X_test, y_test)





Cross Validation scores
[0.68060375 0.67539903 0.67106176 0.67938931 0.68025677]
-------Accuracy Score--------
0.699210686095932


-------Classification Report--------
              precision    recall  f1-score   support

    negative       0.73      0.59      0.65      2307
     neutral       0.63      0.76      0.69      3255
    positive       0.79      0.72      0.76      2673

   micro avg       0.70      0.70      0.70      8235
   macro avg       0.72      0.69      0.70      8235
weighted avg       0.71      0.70      0.70      8235

-------Confusion Matrix--------


Unnamed: 0,negative,neutral,positive
negative,1354,831,122
neutral,396,2477,382
positive,109,637,1927


In [None]:
#prediction.py
import pandas as pd
import numpy as np
import nltk
import spacy
import re
import string
import warnings
from IPython.core.display import display
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier, DMatrix, cv
warnings.filterwarnings('ignore')

df = pd.read('validation.csv')

df.dropna(inplace=True)

blanks = [] 
for i,txt,lb in df.itertuples():  # iterate over the DataFrame
    if type(txt)==str:            # avoid NaN values
        if txt.isspace():         # test 'text' for whitespace
            blanks.append(i)     # add matching index numbers to the list

df.drop(blanks, inplace=True)

nlp = spacy.load("en")
sp = spacy.load('en_core_web_md')
all_stopwords = sp.Defaults.stop_words

def process_tweets(text):    
    text = str(text).lower() #lower
    text = re.sub('\[.*?\]', '', text) #Remove text in square brackets
    text = re.sub('https?://\S+|www\.\S+', '', text) #Hyperlinks removal
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text) #punctuations
    text = re.sub('\n', '', text) #newlines
    text = re.sub('\w*\d\w*', '', text) #word containing numbers
    tokens = word_tokenize(text) #tokenizing the tweet
    filtered_sentence = " ".join([w for w in tokens if not w in all_stopwords]) #removing stopwords
    clean_text = " ".join([w.lemma_ for w in nlp(filtered_sentence)]) #Lemmatization of words
    return clean_text

df['clean_text'] = df['text'].apply(lambda x:process_tweets(x))

X = df['clean_text']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=30)

def output_metrics(predictions):
    ''' 
    Predictions are the predicted values for the test data 
    '''
    print("-------Accuracy Score--------")
    print(accuracy_score(y_test, predictions))
    accuracy_list.append(accuracy_score(y_test, predictions))
    print('\n')
    print("-------Classification Report--------")
    print(classification_report(y_test,predictions))
    print("-------Confusion Matrix--------")
    conf_mat = confusion_matrix(y_test,predictions)
    conf_mat_df = pd.DataFrame(data=conf_mat,columns=['negative','neutral','positive'],index=['negative','neutral','positive'])
    display(conf_mat_df)
    
    
def train_model(model,X_train,y_train,X_test,y_test):
    '''
    model            :  Model which is going to be used 
    X_train ,y_train :  features and labels of training data
    X_test, y_test   :  features and labels of testing data
    '''
    model_list.append(model)
    cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
    tweet_clf = Pipeline([('tfidf', TfidfVectorizer()),
                         ('model', model)])                    #doing Tf-idf on the data 
    scores = cross_val_score(tweet_clf, X_train, y_train, cv=cv)
    print("Cross Validation scores")
    print(scores)
    tweet_clf.fit(X_train,y_train)
    predictions = tweet_clf.predict(X_test)
    output_metrics(predictions)

xgb = joblib.load('xgb.pkl') 
train_model(xgb, X_train, y_train, X_test, y_test)



