In [25]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.model_selection import cross_val_score

import pandas as pd
import numpy as np
import re

In [26]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

def get_metrics(y_test, y_predicted):  
    # true positives / (true positives+false positives)
    precision = precision_score(y_test, y_predicted, pos_label=None,
                                    average='weighted')             
    # true positives / (true positives + false negatives)
    recall = recall_score(y_test, y_predicted, pos_label=None,
                              average='weighted')
    
    # harmonic mean of precision and recall
    f1 = f1_score(y_test, y_predicted, pos_label=None, average='weighted')
    
    # true positives + true negatives/ total
    accuracy = accuracy_score(y_test, y_predicted)
    return accuracy, precision, recall, f1

In [27]:
#New column for classification labels, either 1 or 0
def scoreCol(df, text_field):
    scores = []
    for x in df[text_field]:
        if x == 'positive':
            x = 1
            scores.append(x)
        elif x == 'negative':
            x = 0
            scores.append(x)
        elif x == 'neutral':
            x = -1
            scores.append(x)
    df['scores'] = scores
    return df

In [28]:
#Read data into dataframe
data = pd.read_csv(r'rawData.csv')

In [29]:
#Call classification label method
data = scoreCol(data,'sentiment')

In [30]:
listOfTokens = []
for tokensUnparsed in data["text"].tolist():
    tokensUnparsed = re.sub("(\\'|'|\\\")", '"', tokensUnparsed)
    listOfTokens.append(tokensUnparsed[2:-2].split('", "'))
print(listOfTokens[:5])

[['i', '`', 'd', 'have', 'responded', 'if', 'i', 'were', 'going'], ['sooo', 'sad', 'i', 'will', 'miss', 'you', 'here', 'in', 'san', 'diego'], ['my', 'boss', 'is', 'bullying', 'me'], ['what', 'interview', 'leave', 'me', 'alone'], ['sons', 'of', 'why', 'couldn', '`', 't', 'they', 'put', 'them', 'on', 'the', 'releases', 'we', 'already', 'bought']]


In [31]:
all_words = [token for tokens in listOfTokens for token in tokens]
sentence_lengths = [len(tokens) for tokens in listOfTokens]
Vocabulary = sorted(list(set(all_words)))
print("%s tokens total, with a vocabulary size of %s" % (len(all_words), len(Vocabulary)))

378839 tokens total, with a vocabulary size of 26323


In [32]:
# Create list_corpus
token_list = data['text'].tolist()
list_corpus = []
for tokens in token_list:
    tokens = tokens.replace("[","").replace("]","").replace("'","").replace(",","")
    list_corpus.append(tokens)
print(list_corpus[:5])

# Get labels
list_labels = []
for l in data['scores'].tolist():
    label = int(l)
    list_labels.append(label)

['i ` d have responded if i were going', 'sooo sad i will miss you here in san diego', 'my boss is bullying me', 'what interview leave me alone', 'sons of why couldn ` t they put them on the releases we already bought']


In [33]:
#Do a check...
print(list_corpus[:5])
print(list_labels[:5])

['i ` d have responded if i were going', 'sooo sad i will miss you here in san diego', 'my boss is bullying me', 'what interview leave me alone', 'sons of why couldn ` t they put them on the releases we already bought']
[-1, 0, 0, 0, 0]


In [34]:
# Convert all review documents to a sparse matrix of token counts
vectorizer = CountVectorizer() 
termDocumentMatrix = vectorizer.fit_transform(list_corpus)

In [35]:
#Split the training dataset into two sections:
x_train, x_test, y_train, y_test = train_test_split(termDocumentMatrix, list_labels, test_size = 0.2)

In [36]:
#Create a Multinominal Classifier
mnb = MultinomialNB()

In [37]:
#Train the model using the training sets
mnb.fit(x_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [38]:
#Predict the response for test dataset
y_predicted_counts = mnb.predict(x_test)

In [39]:
accuracy, precision, recall, f1 = get_metrics(y_test, y_predicted_counts)

In [40]:
print("data = cleaned Dataset: vectorizer = CountVectorizer, accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))
metrics.confusion_matrix(y_test,y_predicted_counts)

data = cleaned Dataset: vectorizer = CountVectorizer, accuracy = 0.651, precision = 0.655, recall = 0.651, f1 = 0.651


array([[1538,  375,  369],
       [ 543,  935,   87],
       [ 459,   86, 1105]], dtype=int64)

### Perform Cross fold validation

In [41]:
# Convert all review documents to a sparse matrix of token counts
vectorizer = CountVectorizer() 
termDocumentMatrix = vectorizer.fit_transform(list_corpus)

In [42]:
print(len(vectorizer.get_feature_names()))

26283


In [43]:
print(termDocumentMatrix.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [44]:
real_pred = pd.DataFrame({'y_true':y_test, 'y_pred':y_predicted_counts})
real_pred

Unnamed: 0,y_true,y_pred
0,1,1
1,0,0
2,0,-1
3,-1,-1
4,-1,-1
...,...,...
5492,1,1
5493,1,1
5494,1,1
5495,1,-1


In [45]:
cv_results = cross_val_score(mnb, x_train, y_train, scoring = 'accuracy', cv=5, n_jobs=-1)

In [46]:
cv_results

array([0.63907209, 0.63861724, 0.62952013, 0.6497612 , 0.63785259])

In [47]:
print("Accuracy archived: {0} %".format( round(accuracy_score(real_pred['y_true'],real_pred['y_pred']), 2)))

Accuracy archived: 0.65 %


In [48]:
print(classification_report(real_pred['y_true'],real_pred['y_pred']))

              precision    recall  f1-score   support

          -1       0.61      0.67      0.64      2282
           0       0.67      0.60      0.63      1565
           1       0.71      0.67      0.69      1650

    accuracy                           0.65      5497
   macro avg       0.66      0.65      0.65      5497
weighted avg       0.65      0.65      0.65      5497

