In [2]:
import re

import pandas as pd

import numpy as np

from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report

from sklearn.metrics import accuracy_score

from sklearn.feature_extraction.text import CountVectorizer

from collections import defaultdict
import datasets as ds

dataset = ds.load_dataset('glue','sst2',split='train')

dataset


Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 67349
})

In [4]:
data = dataset.to_pandas()
data

Unnamed: 0,sentence,label,idx
0,hide new secretions from the parental units,0,0
1,"contains no wit , only labored gags",0,1
2,that loves its characters and communicates som...,1,2
3,remains utterly satisfied to remain the same t...,0,3
4,on the worst revenge-of-the-nerds clichés the ...,0,4
...,...,...,...
67344,a delightful comedy,1,67344
67345,"anguish , anger and frustration",0,67345
67346,"at achieving the modest , crowd-pleasing goals...",1,67346
67347,a patient viewer,1,67347


In [5]:
data.isnull().sum()

sentence    0
label       0
idx         0
dtype: int64

In [6]:
data.shape

(67349, 3)

In [7]:
data = data.drop(['idx'],axis=1)
data

Unnamed: 0,sentence,label
0,hide new secretions from the parental units,0
1,"contains no wit , only labored gags",0
2,that loves its characters and communicates som...,1
3,remains utterly satisfied to remain the same t...,0
4,on the worst revenge-of-the-nerds clichés the ...,0
...,...,...
67344,a delightful comedy,1
67345,"anguish , anger and frustration",0
67346,"at achieving the modest , crowd-pleasing goals...",1
67347,a patient viewer,1


In [8]:
# nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
data['sentence'] = data['sentence'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
data

Unnamed: 0,sentence,label
0,hide new secretions parental units,0
1,"contains wit , labored gags",0
2,loves characters communicates something rather...,1
3,remains utterly satisfied remain throughout,0
4,worst revenge-of-the-nerds clichés filmmakers ...,0
...,...,...
67344,delightful comedy,1
67345,"anguish , anger frustration",0
67346,"achieving modest , crowd-pleasing goals sets",1
67347,patient viewer,1


In [9]:
data['sentence'].str.lower()
data

Unnamed: 0,sentence,label
0,hide new secretions parental units,0
1,"contains wit , labored gags",0
2,loves characters communicates something rather...,1
3,remains utterly satisfied remain throughout,0
4,worst revenge-of-the-nerds clichés filmmakers ...,0
...,...,...
67344,delightful comedy,1
67345,"anguish , anger frustration",0
67346,"achieving modest , crowd-pleasing goals sets",1
67347,patient viewer,1


In [10]:
import string
exclude = string.punctuation

In [11]:
def remove_punc(text):
    return text.translate(str.maketrans('','',exclude))

In [12]:
data['sentence']=data['sentence'].apply(remove_punc)

In [13]:
(data['label']==-1).any()

False

In [14]:
# nltk.download('punkt')
# from nltk.tokenize import word_tokenize
# data['sentence']=data['sentence'].apply(word_tokenize)
import nltk
nltk.download('wordnet')
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_text(text):
    st = ""
    for w in w_tokenizer.tokenize(text):
        st = st + lemmatizer.lemmatize(w) + " "
    return st
data['sentence'] = data['sentence'].apply(lemmatize_text)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dattu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [15]:
data

Unnamed: 0,sentence,label
0,hide new secretion parental unit,0
1,contains wit labored gag,0
2,love character communicates something rather b...,1
3,remains utterly satisfied remain throughout,0
4,worst revengeofthenerds clichés filmmaker coul...,0
...,...,...
67344,delightful comedy,1
67345,anguish anger frustration,0
67346,achieving modest crowdpleasing goal set,1
67347,patient viewer,1


In [16]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
def stem_words(text):
    return " ".join([ps.stem(word) for word in text.split()])

In [17]:
data['sentence'] = data['sentence'].apply(stem_words)

In [18]:
data

Unnamed: 0,sentence,label
0,hide new secret parent unit,0
1,contain wit labor gag,0
2,love charact commun someth rather beauti human...,1
3,remain utterli satisfi remain throughout,0
4,worst revengeofthenerd cliché filmmak could dredg,0
...,...,...
67344,delight comedi,1
67345,anguish anger frustrat,0
67346,achiev modest crowdpleas goal set,1
67347,patient viewer,1


In [19]:
sentences = data['sentence'].values
labels = data['label'].values
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(labels)
train_sentences, test_sentences, train_labels, test_labels = train_test_split(sentences, encoded_labels, stratify = encoded_labels)

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf = TfidfVectorizer()
#applying tf idf to training data
X_train_tf = tf_idf.fit_transform(train_sentences)


In [21]:
print("n_samples: %d, n_features: %d" % X_train_tf.shape)

n_samples: 50511, n_features: 10596


In [22]:
X_test_tf = tf_idf.transform(test_sentences)
print("n_samples: %d, n_features: %d" % X_test_tf.shape)

n_samples: 16838, n_features: 10596


In [23]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_tf,train_labels)

In [24]:
y_pred = model.predict(X_test_tf)

In [25]:
y_pred

array([0, 1, 0, ..., 0, 1, 1], dtype=int64)

In [26]:
from sklearn import metrics
print(metrics.classification_report(test_labels, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.83      0.85      7445
           1       0.87      0.91      0.89      9393

    accuracy                           0.87     16838
   macro avg       0.87      0.87      0.87     16838
weighted avg       0.87      0.87      0.87     16838



In [27]:
print("Confusion matrix:")
print(metrics.confusion_matrix(test_labels, y_pred))


Confusion matrix:
[[6161 1284]
 [ 853 8540]]


In [28]:
test = "The customer service was outstanding, and the product was exactly what I was looking for"

def func(text):
    return ' '.join([word for word in text.split() if word not in (stop_words)])
test = func(test)
test

'The customer service outstanding, product exactly I looking'

In [29]:
test = remove_punc(test)
test

'The customer service outstanding product exactly I looking'

In [30]:
test = lemmatize_text(test)

In [31]:
test = stem_words(test)
test
test_proccessed = [''.join(test)]

In [32]:
test_input = tf_idf.transform(test_proccessed)
test_input

<1x10596 sparse matrix of type '<class 'numpy.float64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [33]:
test_input.shape

(1, 10596)

In [34]:
result = model.predict(test_input)[0]
if result:
    print('Positive Review')
else:
    print('Negative Review')

Positive Review


In [35]:
def test_input_prediction(text):
    text = ' '.join([word for word in text.split() if word not in (stop_words)])
    text = remove_punc(text)
    text = lemmatize_text(text)
    text = stem_words(text)
    text_proccessed = [''.join(text)]
    text_input = tf_idf.transform(text_proccessed)
    result = model.predict(text_input)[0]
    if result:
        print('Positive Review')
    else:
        print('Negative Review')
    return

In [36]:
test_input_prediction("it was too bad")

Negative Review


In [37]:
test_input_prediction("I haven't come accross such a movie.The plot was good but it could have been improved a lot")

Positive Review


In [38]:
test_input_prediction("hi Handsome")

Positive Review


In [39]:
test_input_prediction("alwyas- is the correct spelling")

Positive Review


In [40]:
test_input_prediction("hi stupid")

Negative Review


In [41]:
test_input_prediction("alwyas- is the wrong spelling")

Positive Review


In [42]:
test_input_prediction("shwetha is ugly")

Negative Review
