In [1]:
import pandas as pd
import numpy as np
import re
import string

import codecs

import seaborn as sns
import matplotlib.pyplot as plt


from sklearn import feature_extraction
from sklearn import pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn import metrics


# Models
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score, precision_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold, KFold
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold

from sklearn import metrics
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

import warnings
warnings.simplefilter("ignore")

In [2]:
# Read the training tweets and the competition tweets (file name: no labels).
df, test_df = (
    pd.read_csv(csv_path, encoding='UTF-8')
    for csv_path in ('train.csv', 'test_with_no_labels.csv')
)
# Separate working copy so exploratory columns never touch the training frame.
df_copy = df.copy()

In [3]:
df.head()

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's not like we lack evidence of anthropogeni...,126103
2,2,RT @RawStory: Researchers say we have three ye...,698562
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954


In [4]:
df.describe()

Unnamed: 0,sentiment,tweetid
count,15819.0,15819.0
mean,0.917504,501719.433656
std,0.836537,289045.983132
min,-1.0,6.0
25%,1.0,253207.5
50%,1.0,502291.0
75%,1.0,753769.0
max,2.0,999888.0


In [5]:
#Checking for null objects in train data
df.isnull().sum()

sentiment    0
message      0
tweetid      0
dtype: int64

In [6]:
# Store the length of every message on the working copy.
df_copy['length'] = df_copy['message'].apply(len)

In [7]:
display(df.shape)

(15819, 3)

In [8]:
# Absolute and relative frequency of every tweetid value, shown side by side.
# NOTE(review): the recorded output shows every id occurring exactly once, so
# this table only confirms the ids are unique. If the class balance was the
# goal, "sentiment" is presumably the intended column — confirm.
value_counts = df["tweetid"].value_counts().rename("Raw Number")
value_normd = df["tweetid"].value_counts(normalize=True).rename("Percentage")

display(pd.concat([value_counts, value_normd], axis=1))

Unnamed: 0,Raw Number,Percentage
755713,1,0.000063
726392,1,0.000063
994687,1,0.000063
675203,1,0.000063
386438,1,0.000063
...,...,...
230309,1,0.000063
39846,1,0.000063
594855,1,0.000063
615337,1,0.000063


In [9]:
#Remove Punctuations
def _remove_punc(x):
    x = re.sub(r'[-]',' ',x)
    x = re.sub(r'[_]', ' ', x)
    x = re.sub(r'[^\w\s]','',x)
    x = re.sub('[0-9]+', '', x)
    x = re.sub(r'[^\x00-\x7f]',r'', x)
    return x

#Apply the function to the dataset
df_copy['clean_punc'] = df_copy['message'].apply(_remove_punc)

In [10]:
#Making the characters in the text a lower case
def _lower(x):
    return x.lower()
df_copy['lower'] = df_copy['clean_punc'].apply(_lower)

In [11]:
from nlppreprocess import NLP
import nltk

In [12]:
#Remove Stopwords
stopword = nltk.corpus.stopwords.words('english')
def remove_stopwords(x):
    """Run ``x`` through an ``nlppreprocess.NLP`` processor and return the result.

    The processor is configured with ``replace_words=True``,
    ``remove_stopwords=True``, ``remove_numbers=True`` and
    ``remove_punctuations=False``. Judging from the sample rows recorded in
    this notebook it rewrites contractions (e.g. "doesnt" -> "not") and drops
    common stop words; see the nlppreprocess documentation for the exact rules.

    Parameters
    ----------
    x : str
        Text to process (the notebook passes punctuation-free, lower-cased
        tweets).

    Returns
    -------
    Whatever ``NLP.process`` returns for ``x`` (a string in the recorded output).
    """
    # The configuration never changes, so build the processor once and cache
    # it on the function instead of constructing a new one for every tweet.
    # NOTE(review): assumes NLP.process keeps no per-call state — confirm.
    processor = getattr(remove_stopwords, '_processor', None)
    if processor is None:
        processor = NLP(replace_words=True, remove_stopwords=True,
                        remove_numbers=True, remove_punctuations=False)
        remove_stopwords._processor = processor
    return processor.process(x)

# Run the stop-word pass over the lower-cased column of the working copy.
df_copy['Text_nonstop'] = df_copy['lower'].apply(remove_stopwords)

In [13]:
def _analyzer(x):
    """Clean one raw tweet with the notebook's full preprocessing chain.

    The steps run in this order: ``_remove_punc`` (punctuation, digits and
    non-ASCII removal), ``_lower`` (lower-casing), then ``remove_stopwords``.
    """
    return remove_stopwords(_lower(_remove_punc(x)))

In [14]:
df_copy.head()

Unnamed: 0,sentiment,message,tweetid,length,clean_punc,lower,Text_nonstop
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221,140,PolySciMajor EPA chief doesnt think carbon dio...,polyscimajor epa chief doesnt think carbon dio...,polyscimajor epa chief not think carbon dioxid...
1,1,It's not like we lack evidence of anthropogeni...,126103,62,Its not like we lack evidence of anthropogenic...,its not like we lack evidence of anthropogenic...,its not like we lack evidence anthropogenic gl...
2,2,RT @RawStory: Researchers say we have three ye...,698562,140,RT RawStory Researchers say we have three year...,rt rawstory researchers say we have three year...,rt rawstory researchers say we three years act...
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736,99,TodayinMaker WIRED was a pivotal year in the...,todayinmaker wired was a pivotal year in the...,todayinmaker wired pivotal year in war climate...
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954,122,RT SoyNovioDeTodas Its and a racist sexist cl...,rt soynoviodetodas its and a racist sexist cl...,rt soynoviodetodas its and racist sexist clima...


In [15]:
# Add a 'cleaned' column to both frames using the same _analyzer chain, so the
# training and competition text are preprocessed identically.
test_df['cleaned'] = test_df['message'].apply(_analyzer)
df['cleaned'] = df['message'].apply(_analyzer)

In [16]:
# Splitting  X (indepedent) and Y (target/dependent) variables
X = df['cleaned']
y = df['sentiment']

In [17]:
X_train , X_test , y_train , y_test = train_test_split(X , y, stratify=y,
                                                       test_size =0.4, 
                                                       random_state=42)

In [18]:
algo = [LogisticRegression(random_state =42 , max_iter=5000) , 
       MultinomialNB(), LinearSVC(random_state=42), 
       SGDClassifier(random_state=42), RidgeClassifier(random_state=42)]

In [19]:
def _performace_assesment(*args, **kwargs):
    """Fit every candidate classifier on char n-gram TF-IDF features and score it.

    Parameters
    ----------
    *args
        When at least five positional arguments are given, the first five are
        used as ``(classifiers, X_train, X_test, y_train, y_test)``. With
        fewer, the notebook globals ``algo``, ``X_train``, ``X_test``,
        ``y_train`` and ``y_test`` are used, which is what the function always
        did before it honoured its arguments.
    **kwargs
        Accepted for backward compatibility and ignored.

    Returns
    -------
    pandas.DataFrame
        One row per classifier class name with the columns 'F1-Macro',
        'F1-Accuracy' (micro-averaged F1) and 'F1-Weighted', all computed on
        the test split.
    """
    if len(args) >= 5:
        classifiers, train_X, test_X, train_y, test_y = args[:5]
    else:
        classifiers, train_X, test_X, train_y, test_y = (
            algo, X_train, X_test, y_train, y_test)

    # Result column -> `average` argument passed to f1_score.
    f1_variants = (('F1-Macro', 'macro'),
                   ('F1-Accuracy', 'micro'),
                   ('F1-Weighted', 'weighted'))

    model_stats = {}
    for clf in classifiers:
        # NOTE: stop_words is kept to match the other vectorizers in this
        # notebook, but scikit-learn documents it as applying only when
        # analyzer == 'word', so it has no effect on these char n-grams.
        model = Pipeline([
            ('tfidf', TfidfVectorizer(stop_words='english', max_df=0.9,
                                      ngram_range=(1, 5), analyzer='char')),
            ('clf', clf),
        ])

        model.fit(train_X, train_y)         # Training
        model_pred = model.predict(test_X)  # Testing

        model_stats[clf.__class__.__name__] = {
            column: metrics.f1_score(test_y, model_pred, average=average)
            for column, average in f1_variants
        }
    return pd.DataFrame.from_dict(model_stats, orient='index')

In [20]:
# Score every candidate classifier on the current train/test split.
performance = _performace_assesment(algo , X_train , X_test , y_train , y_test)
# Persist the score table, then read it back with the first CSV column (the
# model names) as the index. The read must follow the write.
performance.to_csv('performance.csv')
dataframe = pd.read_csv('performance.csv', index_col = 0)
# Rank the models by weighted F1, best first (displayed, not stored).
dataframe.sort_values('F1-Weighted', ascending=False)

Unnamed: 0,F1-Macro,F1-Accuracy,F1-Weighted
LinearSVC,0.640423,0.741308,0.724732
RidgeClassifier,0.62763,0.737832,0.717328
SGDClassifier,0.63053,0.73641,0.716903
LogisticRegression,0.578848,0.722977,0.692409
MultinomialNB,0.179813,0.541245,0.382343


In [21]:
def _param_tuning(*args, **kwargs):
    """Collect the hyper-parameter settings of every candidate classifier.

    Each classifier is wrapped in the same char n-gram TF-IDF pipeline, fitted,
    and its entries are extracted from ``Pipeline.get_params()``. No search is
    performed here: the values reported are the settings the estimators were
    constructed with.

    Parameters
    ----------
    *args
        When at least three positional arguments are given, the first three
        are used as ``(classifiers, X_train, y_train)``. With fewer, the
        notebook globals ``algo``, ``X_train`` and ``y_train`` are used, which
        is what the function always did before it honoured its arguments.
    **kwargs
        Accepted for backward compatibility and ignored.

    Returns
    -------
    dict
        Maps each classifier class name to a dict holding the estimator itself
        under 'model' plus one entry per estimator parameter.
    """
    if len(args) >= 3:
        classifiers, train_X, train_y = args[:3]
    else:
        classifiers, train_X, train_y = algo, X_train, y_train

    step_name = 'clf'
    prefix = step_name + '__'  # Pipeline exposes step params as '<step>__<param>'

    best_params = {}
    for clf in classifiers:
        # NOTE: stop_words is documented by scikit-learn as applying only when
        # analyzer == 'word'; it is kept for parity with the other cells.
        model = Pipeline([
            ('tfidf', TfidfVectorizer(stop_words='english', max_df=0.9,
                                      ngram_range=(1, 5), analyzer='char')),
            (step_name, clf),
        ])
        model.fit(train_X, train_y)  # Training

        # Keep only the classifier step: the step itself and its parameters.
        settings = {}
        for key, value in model.get_params().items():
            if key == step_name:
                settings['model'] = value
            elif key.startswith(prefix):
                settings[key[len(prefix):]] = value
        best_params[clf.__class__.__name__] = settings
    return best_params

In [22]:
best_params = _param_tuning(algo, X_train, y_train)

In [23]:
best_params

{'LogisticRegression': {'model': LogisticRegression(max_iter=5000, random_state=42),
  'C': 1.0,
  'class_weight': None,
  'dual': False,
  'fit_intercept': True,
  'intercept_scaling': 1,
  'l1_ratio': None,
  'max_iter': 5000,
  'multi_class': 'auto',
  'n_jobs': None,
  'penalty': 'l2',
  'random_state': 42,
  'solver': 'lbfgs',
  'tol': 0.0001,
  'verbose': 0,
  'warm_start': False},
 'MultinomialNB': {'model': MultinomialNB(),
  'alpha': 1.0,
  'class_prior': None,
  'fit_prior': True},
 'LinearSVC': {'model': LinearSVC(random_state=42),
  'C': 1.0,
  'class_weight': None,
  'dual': True,
  'fit_intercept': True,
  'intercept_scaling': 1,
  'loss': 'squared_hinge',
  'max_iter': 1000,
  'multi_class': 'ovr',
  'penalty': 'l2',
  'random_state': 42,
  'tol': 0.0001,
  'verbose': 0},
 'SGDClassifier': {'model': SGDClassifier(random_state=42),
  'alpha': 0.0001,
  'average': False,
  'class_weight': None,
  'early_stopping': False,
  'epsilon': 0.1,
  'eta0': 0.0,
  'fit_intercept': 

In [24]:
#model
model1 = MultinomialNB()

In [25]:
# Fit a character n-gram (1-5) TF-IDF vocabulary on the training split and
# project both splits into it.
# NOTE: scikit-learn documents stop_words as applying only when
# analyzer == 'word', so it has no effect with analyzer='char'.
# NOTE(review): X_train / X_test are rebound from text to the transformed
# output here. Re-running this cell (or any earlier cell that expects text)
# without first re-running the train/test split will feed the transformed
# data back in — confirm the notebook is only ever run top to bottom.
Vectorize = TfidfVectorizer(stop_words = 'english', max_df=0.9, ngram_range=(1, 5), analyzer= 'char')
X_train = Vectorize.fit_transform(X_train)
X_test = Vectorize.transform(X_test)

In [26]:
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True,
                                   random_state=42)

In [27]:
best_params[algo[1].__class__.__name__]

{'model': MultinomialNB(),
 'alpha': 1.0,
 'class_prior': None,
 'fit_prior': True}

In [28]:
# Candidate smoothing values for MultinomialNB: four evenly spaced points
# running from 0.1 down to 0.02.
# NOTE(review): the recorded run picked alpha=0.02, the edge of this grid, so
# smaller values may do better — consider extending the range.
alpha = [value for value in np.linspace(0.1, 0.02, 4)]
param_grid = {'alpha': alpha}
# Weighted-F1 search over the shared stratified folds, using all cores.
grid_search = GridSearchCV(
    model1,
    param_grid,
    scoring='f1_weighted',
    cv=stratified_kfold,
    error_score=0,
    n_jobs=-1,
)

In [29]:
grid_search.fit(X_train, y_train)
prediction = grid_search.predict(X_test)
cv_score = grid_search.best_score_
test_score = grid_search.score(X_test, y_test)

In [30]:
print(f'Cross-validation score: {cv_score}')
print(f'Test score: {test_score}')
# Only the last expression of a cell is echoed, so the winning parameters are
# shown explicitly; previously this line was evaluated and silently discarded.
display(grid_search.best_params_)
grid_search.best_estimator_

Cross-validation score: 0.7042913921456904
Test score: 0.7080661722067843


MultinomialNB(alpha=0.02)

In [31]:
#model
model2 = RidgeClassifier()

In [32]:
best_params[algo[4].__class__.__name__]

{'model': RidgeClassifier(random_state=42),
 'alpha': 1.0,
 'class_weight': None,
 'copy_X': True,
 'fit_intercept': True,
 'max_iter': None,
 'normalize': 'deprecated',
 'positive': False,
 'random_state': 42,
 'solver': 'auto',
 'tol': 0.001}

In [33]:
# Candidate regularisation strengths for RidgeClassifier: five evenly spaced
# points from 0.15 to 0.4.
alpha = [value for value in np.linspace(0.15, 0.4, 5)]
param_grid = {'alpha': alpha}
# Weighted-F1 search over the shared stratified folds, using all cores.
grid_search = GridSearchCV(
    model2,
    param_grid,
    scoring='f1_weighted',
    cv=stratified_kfold,
    error_score=0,
    n_jobs=-1,
)

In [34]:
grid_search.fit(X_train, y_train)
prediction = grid_search.predict(X_test)
cv_score = grid_search.best_score_
test_score = grid_search.score(X_test, y_test)

In [35]:
print(f'Cross-validation score: {cv_score}')
print(f'Test score: {test_score}')
# Only the last expression of a cell is echoed, so the winning parameters are
# shown explicitly; previously this line was evaluated and silently discarded.
display(grid_search.best_params_)
grid_search.best_estimator_

Cross-validation score: 0.7193746795060804
Test score: 0.7185523911642698


RidgeClassifier(alpha=0.3375)

In [36]:
from sklearn.ensemble import StackingClassifier
from sklearn.pipeline import make_pipeline

In [37]:
X_train , X_test , y_train , y_test = train_test_split(X, y,  stratify=y, test_size=0.4, random_state =1)

In [38]:
vect = TfidfVectorizer(stop_words = 'english', max_df=0.9, ngram_range=(2, 6), analyzer= 'char')
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)

In [39]:
# Base learners for the two-estimator stack.
# NOTE(review): both base learners are configured identically, so they
# presumably contribute the same predictions to the meta-learner — confirm
# the duplication is intentional.
multiNB1, multiNB2 = (MultinomialNB(alpha=0.1) for _ in range(2))

estimators = list(zip(('multiNB1', 'multiNB2'), (multiNB1, multiNB2)))
# NOTE(review): the recorded Ridge grid search selected alpha=0.3375, not the
# 0.2125 used for the meta-learner here — confirm this value is deliberate.
final_est = RidgeClassifier(alpha=0.2125)

In [40]:
stacking_NB2 = StackingClassifier(estimators = estimators,
                           final_estimator = final_est,
                           passthrough = True)

In [41]:
stacking_NB2.fit(X_train , y_train)

StackingClassifier(estimators=[('multiNB1', MultinomialNB(alpha=0.1)),
                               ('multiNB2', MultinomialNB(alpha=0.1))],
                   final_estimator=RidgeClassifier(alpha=0.2125),
                   passthrough=True)

In [42]:
pred = stacking_NB2.predict(X_test)

In [43]:
# Test-split F1 of the two-estimator stack under three averaging schemes
# ('F1-Accuracy' is the micro average).
model_stats = {
    stacking_NB2.__class__.__name__: {
        column: metrics.f1_score(y_test, pred, average=average)
        for column, average in (('F1-Macro', 'macro'),
                                ('F1-Accuracy', 'micro'),
                                ('F1-Weighted', 'weighted'))
    }
}
pd.DataFrame.from_dict(model_stats, orient='index')

Unnamed: 0,F1-Macro,F1-Accuracy,F1-Weighted
StackingClassifier,0.634052,0.74036,0.719919


In [44]:
# New stratified split that holds out only 5% of the rows, then raw counts of
# character 3- to 7-grams fitted on the training portion.
# NOTE(review): scores from this 5% hold-out are not directly comparable with
# the earlier cells, which hold out 40%.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.05, random_state=1)
count_vec = CountVectorizer(ngram_range=(3, 7), analyzer='char')
X_train = count_vec.fit_transform(X_train)
X_test = count_vec.transform(X_test)

In [45]:
# Base learners for the three-estimator stack.
# NOTE(review): all three base learners are configured identically — confirm
# the duplication is intentional.
multiNB1, multiNB2, multiNB3 = (MultinomialNB(alpha=0.1) for _ in range(3))

estimators = list(zip(('multiNB1', 'multiNB2', 'multiNB3'),
                      (multiNB1, multiNB2, multiNB3)))
# NOTE(review): the recorded Ridge grid search selected alpha=0.3375, not the
# 0.2125 used for the meta-learner here — confirm this value is deliberate.
final_est = RidgeClassifier(alpha=0.2125)

In [46]:
stacking_NB3 = StackingClassifier(estimators = estimators,
                           final_estimator = final_est,
                           passthrough = True)

In [47]:
stacking_NB3.fit(X_train , y_train)

StackingClassifier(estimators=[('multiNB1', MultinomialNB(alpha=0.1)),
                               ('multiNB2', MultinomialNB(alpha=0.1)),
                               ('multiNB3', MultinomialNB(alpha=0.1))],
                   final_estimator=RidgeClassifier(alpha=0.2125),
                   passthrough=True)

In [48]:
pred = stacking_NB3.predict(X_test)

In [49]:
# Test-split F1 of the three-estimator stack under three averaging schemes
# ('F1-Accuracy' is the micro average).
model_stats = {
    stacking_NB3.__class__.__name__: {
        column: metrics.f1_score(y_test, pred, average=average)
        for column, average in (('F1-Macro', 'macro'),
                                ('F1-Accuracy', 'micro'),
                                ('F1-Weighted', 'weighted'))
    }
}
pd.DataFrame.from_dict(model_stats, orient='index')

Unnamed: 0,F1-Macro,F1-Accuracy,F1-Weighted
StackingClassifier,0.697838,0.76359,0.758913


In [50]:
# Transform the cleaned competition tweets with `vect`, the TF-IDF vectorizer
# that was fitted on the training split used for the two-estimator stack.
# NOTE(review): this rebinds X (previously the training text) and Vectorize
# (previously a TfidfVectorizer instance) to different kinds of object, so
# earlier cells that read either name will misbehave if re-run after this one.
X = test_df['cleaned']
Vectorize = vect.transform(X)

In [None]:
# NOTE(review): these two statements repeat the preceding cell verbatim and
# this cell has no execution count; presumably a leftover that can be deleted
# — confirm.
X = test_df['cleaned']
Vectorize = vect.transform(X)

In [52]:
stacking_pred = stacking_NB2.predict(Vectorize)

In [53]:
tweetid = test_df.tweetid.values

In [54]:
# One row per competition tweet: its id next to the stacked model's label.
submission12 = pd.DataFrame(
    [(tweet, label) for tweet, label in zip(tweetid, stacking_pred)],
    columns=['tweetid', 'sentiment'],
)
submission12.head(n=10)

Unnamed: 0,tweetid,sentiment
0,169760,1
1,35326,1
2,224985,1
3,476263,1
4,872928,1
5,75639,1
6,211536,1
7,569434,2
8,315368,2
9,591733,1


In [55]:
submission12.to_csv('T10.csv',index=False)