# Part 2: Model Build and Evaluation

This notebook is structured to help guide you through the second half of this challenge. If additional cells are needed to build and train your classifier, please feel free to use additional cells. Otherwise please refrain from adding cells at any point in the notebook during this challenge. Please also do not delete or modify the provided headers to the cells. You are welcome to additional comments, though, if needed! Thank you!

### Import your libraries in the cell below

In [1]:
import sys
import pandas as pd
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
import numpy as np

from sklearn.metrics import classification_report, accuracy_score, f1_score, GridSearchCV

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/giguser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Import in your csv from the previous notebook in the cell below

In [2]:
df = pd.read_csv('train.csv', index_col='Unnamed: 0')
test_df = pd.read_csv('test.csv', index_col='Unnamed: 0')

### Build and Train your Classifier in this and the following cell(s) 

In [3]:
# preprocessing
X_train = df.drop(columns =['Sentiment'])
y_train = df.loc[:, 'Sentiment']

X_test = df.drop(columns =['Sentiment'])
y_test = df.loc[:, 'Sentiment']

# remove stopwords 
stop = stopwords.words('english')
X_train['Phrase'] = X_train['Phrase'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

# text vectorization:  transform our data into vectors
# UNIGRAM
uni_vectorizer = CountVectorizer(ngram_range=(1, 1))
uni_vectorizer.fit(X_train['Phrase'])
uni_X = uni_vectorizer.transform(X_train['Phrase']) 
uni_test_X =  uni_vectorizer.transform(X_test['Phrase'])

#tf-itf, term frequency inverse term frequency:
#UNIGRAM TF-ITF
uni_tfitf = TfidfTransformer().fit(uni_X)
uni_tf_X = uni_tfitf.transform(uni_X)
uni_test_tf_X = uni_tfitf.transform(uni_test_X)

# BIGRAM
bi_vectorizer = CountVectorizer(ngram_range=(1, 2))
bi_vectorizer.fit(X_train['Phrase'])
bi_X = bi_vectorizer.transform(X_train['Phrase']) 
bi_test_X = bi_vectorizer.transform(X_test['Phrase']) 

#BIGRAM TF-ITF
bi_tfitf = TfidfTransformer().fit(bi_X)
bi_tf_X = bi_tfitf.transform(bi_X)
bi_test_tf_X = bi_tfitf.transform(bi_test_X)

In [4]:
def train_and_show_scores(X: csr_matrix, y: np.array, title: str) -> None:
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.75, stratify=y)
    
    #clf = RandomForestClassifier(max_depth=2).fit(X_train, y_train)
    #clf = SGDClassifier().fit(X_train, y_train)    
    
    # fine tune here
    parameters = {'max_iter':[50, 100, 1000, 5000]}
    sgd = SGDClassifier()
    clf = GridSearchCV(sgd, parameters, scoring='f1_macro')

    #parameters = {'max_depth':[1, 2, 3]}
    #rfc = RandomForestClassifier()
    #clf = GridSearchCV(rfc, parameters, scoring='f1_macro')
    

    clf.fit(X_train, y_train)

    fclf = clf.best_estimator_
    print('best params ', clf.best_params_)
    
    y_valid_pred = fclf.predict(X_valid)
    print('Classification report ', classification_report(y_valid_pred, y_valid, zero_division=1))

    print('Accuracy score: ', accuracy_score(y_valid, y_valid_pred))
    print('F1 macro score: ', f1_score(y_valid, y_valid_pred, average='macro', zero_division=1))
    print('F1 weighted score: ', f1_score(y_valid, y_valid_pred, average='weighted', zero_division=1))
    
    return fclf

In [5]:
uni_clf = train_and_show_scores(uni_X, y_train, 'Unigram Counts')
uni_tf_clf = train_and_show_scores(uni_tf_X, y_train, 'Unigram Tf-Idf')
bi_clf = train_and_show_scores(bi_X, y_train, 'Bigram Counts')
bi_tf_clf = train_and_show_scores(bi_tf_X, y_train, 'Bigram Tf-Idf')

best params  {'max_iter': 5000}
Classification report                precision    recall  f1-score   support

           0       0.27      0.54      0.36       509
           1       0.25      0.55      0.34      1771
           2       0.92      0.64      0.76     17369
           3       0.35      0.57      0.43      3065
           4       0.31      0.59      0.40       697

    accuracy                           0.62     23411
   macro avg       0.42      0.58      0.46     23411
weighted avg       0.76      0.62      0.66     23411

Accuracy score:  0.622058006919824
F1 macro score:  0.45923272517222635
F1 weighted score:  0.5806831077750069
best params  {'max_iter': 100}
Classification report                precision    recall  f1-score   support

           0       0.09      0.53      0.15       173
           1       0.09      0.49      0.16       751
           2       0.96      0.58      0.72     20151
           3       0.22      0.54      0.31      1973
           4       0

In [6]:
final_clf = bi_clf

### Create your Predictions in the cell below

In [7]:
y_test_pred = final_clf.predict(bi_test_X)

### Perform the final evaluation of the Performance of your model in the cell below

In [8]:
    print('Accuracy score: ', accuracy_score(y_test, y_test_pred))
    print('F1 macro score: ', f1_score(y_test, y_test_pred, average='macro', zero_division=1))
    print('F1 weighted score: ', f1_score(y_test, y_test_pred, average='weighted', zero_division=1))


Accuracy score:  0.6871769680919226
F1 macro score:  0.5791748046387871
F1 weighted score:  0.6687180580434668
