In [1]:
import numpy as np
import pandas as pd
import nltk

In [2]:
import joblib

In [3]:
data = pd.read_csv('../dataset/imdb_labelled.txt', sep='\t', header=None, names=['Review', 'Sentiment'])
data.head()

Unnamed: 0,Review,Sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [4]:
data.sample(5)

Unnamed: 0,Review,Sentiment
680,The interplay between Martin and Emilio contai...,1
270,I knew when I saw the film that more great thi...,1
345,"However, after seeing the short again after ab...",1
188,"After watching this film, I wanted to learn mo...",1
550,I'm a big fan of this series mostly due to Ann...,1


In [5]:
X = data['Review']
y = data['Sentiment']

In [6]:
np.unique(y, return_counts=True)

(array([0, 1], dtype=int64), array([362, 386], dtype=int64))

In [7]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [8]:
tfidf = TfidfVectorizer(tokenizer=nltk.word_tokenize,
                        stop_words='english',
                        lowercase=True,
                        ngram_range=(1, 1)
                       )
count_vec = CountVectorizer(tokenizer=nltk.word_tokenize,
                            stop_words='english',
                            lowercase=True,
                            ngram_range=(1, 1)
                           )

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y, random_state=666)

In [11]:
X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)



In [12]:
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

In [13]:
model_gauss = GaussianNB()
model_rand = RandomForestClassifier(n_jobs=-1)

In [14]:
model_gauss.fit(X_train.toarray(), y_train)
model_rand.fit(X_train.toarray(), y_train)

In [15]:
y_pred_gauss = model_gauss.predict(X_test.toarray())
y_pred_rand = model_rand.predict(X_test.toarray())

In [16]:
from sklearn.metrics import classification_report

In [17]:
print(classification_report(y_test, y_pred_gauss))

              precision    recall  f1-score   support

           0       0.61      0.71      0.66        55
           1       0.67      0.57      0.62        58

    accuracy                           0.64       113
   macro avg       0.64      0.64      0.64       113
weighted avg       0.64      0.64      0.64       113



In [18]:
print(classification_report(y_test, y_pred_rand))

              precision    recall  f1-score   support

           0       0.66      0.78      0.72        55
           1       0.75      0.62      0.68        58

    accuracy                           0.70       113
   macro avg       0.71      0.70      0.70       113
weighted avg       0.71      0.70      0.70       113



In [19]:
sentence = 'I am hungry, I cannot live without food.'

sent_vectorised = tfidf.transform([sentence])

In [20]:
model_rand.predict(sent_vectorised)

array([0], dtype=int64)

In [22]:
joblib.dump(tfidf, 'tfidf_sentiment.pkl')

['tfidf_sentiment.pkl']

In [23]:
joblib.dump(model_rand, 'random_forest_sentiment.pkl')

['random_forest_sentiment.pkl']

In [21]:
'I am gaoing abroad so that i can study'

'I am gaoing abroad so that i can study'