In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline


In [2]:
imdb_df = pd.read_csv(r"C:\Users\ADMIN\datasets\imdb_dataset.csv", encoding="ISO-8859-1")

In [3]:
imdb_df.shape

(50000, 2)

In [4]:
imdb_df = imdb_df.sample(10000, replace= False)

In [5]:
imdb_df.describe()

Unnamed: 0,Review,Label
count,10000,10000
unique,9979,2
top,"I thought Rachel York was fantastic as ""Lucy.""...",pos
freq,2,5050


In [6]:
X = imdb_df['Review']
Y = imdb_df['Label']

In [7]:
X.head()

22353    I think this movie more than any other shows w...
19222    Nazarin is some kind of saint,he wants to live...
44809    This film was great.<br /><br />The plot was p...
26832    I don't know where to begin, so I'll begin wit...
Name: Review, dtype: object

In [8]:
Y.head()

22353    pos
19222    pos
47170    pos
44809    pos
26832    neg
Name: Label, dtype: object

In [9]:
count_vectorizer = CountVectorizer()
transformed_vector = count_vectorizer.fit_transform(X)

In [10]:
transformed_vector.shape

(10000, 52595)

In [11]:
print(transformed_vector[0])

  (0, 46773)	1
  (0, 46798)	2
  (0, 30994)	1
  (0, 30785)	1
  (0, 46614)	1
  (0, 2581)	1
  (0, 33220)	1
  (0, 42009)	1
  (0, 51162)	1
  (0, 20263)	2
  (0, 1103)	1
  (0, 14246)	1
  (0, 4209)	1
  (0, 24553)	2
  (0, 4486)	1
  (0, 41693)	1
  (0, 35279)	1
  (0, 48508)	1
  (0, 14399)	1
  (0, 23370)	1
  (0, 21897)	1
  (0, 40679)	1
  (0, 51215)	1
  (0, 43268)	1
  (0, 31851)	1
  (0, 23116)	1
  (0, 21723)	1
  (0, 4637)	1
  (0, 17802)	1
  (0, 51618)	1
  (0, 27797)	1
  (0, 32696)	1
  (0, 26710)	1
  (0, 13870)	1
  (0, 49638)	1
  (0, 19774)	1
  (0, 18095)	1
  (0, 46828)	1
  (0, 17153)	1
  (0, 19912)	1
  (0, 31003)	1
  (0, 6899)	1
  (0, 37716)	1
  (0, 15547)	1
  (0, 32871)	1


In [14]:
tfidf_transformer = TfidfTransformer()
tfidf_vector = tfidf_transformer.fit_transform(transformed_vector)

In [15]:
print(tfidf_vector[0])

  (0, 51618)	0.05708296104182298
  (0, 51215)	0.0928534376855517
  (0, 51162)	0.08169768666731038
  (0, 49638)	0.18269714255202893
  (0, 48508)	0.2255811067814406
  (0, 46828)	0.12225984726499456
  (0, 46798)	0.09258017171354828
  (0, 46773)	0.1069317999233832
  (0, 46614)	0.09572827230663665
  (0, 43268)	0.11900729802928554
  (0, 42009)	0.15138323286969776
  (0, 41693)	0.10730826119003875
  (0, 40679)	0.1644952070897479
  (0, 37716)	0.09189133433400971
  (0, 35279)	0.1525949708245609
  (0, 33220)	0.09669080199444084
  (0, 32871)	0.06584031993738658
  (0, 32696)	0.044256289802619524
  (0, 31851)	0.11036234599044425
  (0, 31003)	0.10650007414597512
  (0, 30994)	0.06272062875444892
  (0, 30785)	0.08548587296117018
  (0, 27797)	0.1946927640403553
  (0, 26710)	0.19888538298311556
  (0, 24553)	0.09329957660340515
  (0, 23370)	0.04734994227639232
  (0, 23116)	0.2805652531220936
  (0, 21897)	0.15104335125665955
  (0, 21723)	0.09963538836765928
  (0, 20263)	0.19910516157065966
  (0, 19912)	0.0

In [24]:
x_train, x_test, y_train, y_test = train_test_split(tfidf_vector, Y, test_size = 0.2)

In [25]:
x_train.shape, x_test.shape

((8000, 52595), (2000, 52595))

In [26]:
y_train.shape, y_test.shape

((8000,), (2000,))

In [27]:
def summarize_classification(y_test, y_pred):
    acc = accuracy_score(y_test, y_pred,normalize=True)
    num_acc = accuracy_score(y_test,y_pred,normalize=True)
    prec = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')

    print("length of testing data:", len(y_test))
    print("accuracy_count:", num_acc)
    print("accuracy_score:", acc)    
    print("precision_count:", prec)
    print("recall_count:", recall)


In [28]:
clf = GaussianNB().fit(x_train.toarray(), y_train)

In [29]:
y_pred = clf.predict(x_test.toarray())

In [31]:
summarize_classification(y_test, y_pred)

length of testing data: 2000
accuracy_count: 0.615
accuracy_score: 0.615
precision_count: 0.6160873805666679
recall_count: 0.615
