In [1]:
import numpy as np
import pandas as pd

In [None]:
df = pd.read_csv('data/reviews.csv')

In [3]:
df.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [4]:
df.isnull().sum()

label      0
review    35
dtype: int64

In [5]:
df[df.isnull()]

Unnamed: 0,label,review
0,,
1,,
2,,
3,,
4,,
...,...,...
1995,,
1996,,
1997,,
1998,,


In [6]:
df.dropna()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...
...,...,...
1995,pos,"i like movies with albert brooks , and i reall..."
1996,pos,it might surprise some to know that joel and e...
1997,pos,the verdict : spine-chilling drama from horror...
1998,pos,i want to correct what i wrote in a former ret...


In [7]:
df = df.dropna(subset=['review'], axis=0)

In [8]:
df['review'] = df['review'].apply(lambda rev: rev.strip())

In [9]:
len(df[df['review']==''])

27

In [10]:
df[df['review']=='']

Unnamed: 0,label,review
57,neg,
71,pos,
147,pos,
151,pos,
283,pos,
307,pos,
313,neg,
323,pos,
343,pos,
351,neg,


In [11]:
df[df['review']==''].index

Int64Index([  57,   71,  147,  151,  283,  307,  313,  323,  343,  351,  427,
             501,  633,  675,  815,  851,  977, 1079, 1299, 1455, 1493, 1525,
            1531, 1763, 1851, 1905, 1993],
           dtype='int64')

In [12]:
df.drop(df[df['review']==''].index)

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...
...,...,...
1995,pos,"i like movies with albert brooks , and i reall..."
1996,pos,it might surprise some to know that joel and e...
1997,pos,the verdict : spine-chilling drama from horror...
1998,pos,i want to correct what i wrote in a former ret...


In [13]:
df = df.drop(df[df['review']==''].index)

In [14]:
len(df[df['review']==''])

0

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1938 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   1938 non-null   object
 1   review  1938 non-null   object
dtypes: object(2)
memory usage: 45.4+ KB


In [16]:
df['label'].value_counts()

neg    969
pos    969
Name: label, dtype: int64

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
X = df['review']
y = df['label']

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

In [20]:
from sklearn.svm import LinearSVC, SVC
from sklearn.feature_extraction.text import TfidfVectorizer

In [21]:
from sklearn.pipeline import Pipeline

In [22]:
pipe = Pipeline([('tfidf', TfidfVectorizer()), ('linsvc', LinearSVC())])

In [23]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('linsvc', LinearSVC())])

In [26]:
preds = pipe.predict(X_test)

In [24]:
from sklearn.metrics import classification_report, confusion_matrix

In [28]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

         neg       0.81      0.86      0.83       191
         pos       0.85      0.81      0.83       197

    accuracy                           0.83       388
   macro avg       0.83      0.83      0.83       388
weighted avg       0.83      0.83      0.83       388



In [29]:
confusion_matrix(y_test, preds)

array([[164,  27],
       [ 38, 159]], dtype=int64)