In [7]:
import numpy as np
import pandas as pd
# Load the tab-separated movie review dataset (columns: label, review).
df = pd.read_csv("../moviereviews.tsv", sep='\t')
print("Total starting entries: \n", len(df))
print("null values before removal: \n", df.isnull().sum())

# Drop rows with missing data (NaN in any column).
df.dropna(inplace=True)
print("\n Total entries after naive isnull removal: \n", len(df))
print("Null values after naive isnull removal: \n", df.isnull().sum())

# Drop blank reviews: whitespace-only strings are not NaN, so dropna() above
# leaves them in place and they must be located explicitly.
# Collect the index labels of reviews that consist only of whitespace.
# Iterating the 'review' column directly (instead of unpacking itertuples())
# avoids depending on the frame having exactly two columns.
blanks = [
    idx
    for idx, review_text in df['review'].items()
    if isinstance(review_text, str) and review_text.isspace()
]
# Remove those rows by index label. This must be DataFrame.drop, not dropna:
# dropna only removes missing values and does not take a list of row labels.
df.drop(index=blanks, inplace=True)
print("\n Total entries after blank review removal: \n", len(df))
print("Null values after blank review removal: \n", df.isnull().sum())

Total starting entries: 
 2000
null values before removal: 
 label      0
review    35
dtype: int64

 Total entries after naive isnull removal: 
 1965
Null values after naive isnull removal: 
 label     0
review    0
dtype: int64

 Total entries after blank review removal: 
 1938
Null values after blank review removal: 
 label     0
review    0
dtype: int64


In [12]:
# scikit-learn pieces used by this cell, gathered in one place.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC  # linear support vector classifier

# Report how many reviews carry each label.
print("Value counts: \n", df['label'].value_counts())

# Features are the raw review text; targets are the labels.
X, y = df['review'], df['label']

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Two-stage pipeline given as (name, estimator) tuples:
# TF-IDF vectorisation of the text, followed by a linear SVM classifier.
text_clf = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', LinearSVC()),
    ]
)
text_clf.fit(X_train, y_train)

# Predict the held-out reviews so they can be compared with the true labels.
predictions = text_clf.predict(X_test)

# Evaluation: confusion matrix, per-class report, then overall accuracy.
print(confusion_matrix(y_true=y_test, y_pred=predictions))
print(classification_report(y_true=y_test, y_pred=predictions))
print(accuracy_score(y_true=y_test, y_pred=predictions))

Value counts: 
 neg    969
pos    969
Name: label, dtype: int64
[[235  47]
 [ 41 259]]
              precision    recall  f1-score   support

         neg       0.85      0.83      0.84       282
         pos       0.85      0.86      0.85       300

   micro avg       0.85      0.85      0.85       582
   macro avg       0.85      0.85      0.85       582
weighted avg       0.85      0.85      0.85       582

0.8487972508591065
