### Task #1: Perform imports and load the dataset into a pandas DataFrame

In [1]:
import pandas as pd

# Read the tab-delimited reviews file into a DataFrame and preview the first rows
df = pd.read_csv(
    filepath_or_buffer='moviereviews2.tsv',
    delimiter='\t',
)
df.head()

Unnamed: 0,label,review
0,pos,I loved this movie and will watch it again. Or...
1,pos,"A warm, touching movie that has a fantasy-like..."
2,pos,I was not expecting the powerful filmmaking ex...
3,neg,"This so-called ""documentary"" tries to tell tha..."
4,pos,This show has been my escape from reality for ...


### Task #2: Check for missing values:

In [2]:
# Count the missing entries in each column
df.isna().sum()

label      0
review    20
dtype: int64

In [3]:
# Discard every row that has at least one missing value, modifying `df` in place
df.dropna(axis=0, how='any', inplace=True)

In [4]:
# Check for whitespace-only reviews (it's OK if there aren't any!).
# The `review` column is iterated by name, so this cell keeps working however
# many columns the frame has. The isinstance guard skips any non-string value
# (e.g. a NaN that was never dropped) instead of raising AttributeError on
# `.isspace()`.
blanks = [
    idx
    for idx, text in df['review'].items()
    if isinstance(text, str) and text.isspace()
]

# Number of whitespace-only reviews found (index labels are kept in `blanks`)
len(blanks)

0

### Task #3: Take a quick look at the `label` column:


In [5]:
# Tally how many reviews carry each label
df.loc[:, 'label'].value_counts()

pos    2990
neg    2990
Name: label, dtype: int64

### Task #4: Split the data into train & test sets:

In [6]:
from sklearn.model_selection import train_test_split

# Inputs are the review texts; targets are the matching labels
X, y = df['review'], df['label']

# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

### Task #5: Build a pipeline to vectorize the data, then train and fit a model

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# Chain TF-IDF vectorization and a linear SVM so raw review text can be passed straight in.
# NOTE(review): LinearSVC is built with default arguments (no random_state), so the fit
# may not be exactly repeatable across runs — confirm whether a seed is wanted.
text_clf = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', LinearSVC()),
    ]
)

# Fit both stages on the training split
text_clf.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

### Task #6: Run predictions and analyze the results


In [8]:
# Predict a label for every review in the held-out test split
predictions = text_clf.predict(X=X_test)

In [9]:
# Confusion matrix for the test split (rows: true label, columns: predicted label)
from sklearn import metrics
print(
    metrics.confusion_matrix(y_true=y_test, y_pred=predictions)
)

[[900  91]
 [ 63 920]]


In [10]:
# Per-class precision, recall and F1 for the test split
print(
    metrics.classification_report(y_true=y_test, y_pred=predictions)
)

              precision    recall  f1-score   support

         neg       0.93      0.91      0.92       991
         pos       0.91      0.94      0.92       983

    accuracy                           0.92      1974
   macro avg       0.92      0.92      0.92      1974
weighted avg       0.92      0.92      0.92      1974



In [11]:
# Overall fraction of test reviews labelled correctly
print(
    metrics.accuracy_score(y_true=y_test, y_pred=predictions)
)

0.9219858156028369


### Task #7: Try the model on sample IMDb reviews

In [12]:
# Spot-check on a single unseen review: Loki (TV show)
text_clf.predict(
    [
        "Dont know about rest 5 but 1st episode was ruthlessly fantastic Loki truly a mischief",
    ]
)

array(['pos'], dtype=object)

In [14]:
# Spot-check on a single unseen review: Family (TV show)
text_clf.predict(
    [
        "What a series! Mind blowing acting, screenplay! The last scene of the last episode, it sounds like another season will come surely",
    ]
)

array(['pos'], dtype=object)

In [15]:
# Spot-check on a single unseen review: Radhe (movie)
text_clf.predict(
    [
        "Salman has competition from himself to make worst movie. And he won this time too",
    ]
)

array(['neg'], dtype=object)

In [17]:
# Spot-check on a single unseen review: Naruto (TV show)
text_clf.predict(
    [
        "Everyday after school I went to the store and bought a bottle of soda and some snacks, and watched Naruto when i came home. Some months later I was borderline obese. 10/10 would recommend.",
    ]
)

array(['pos'], dtype=object)

In [21]:
# Spot-check on a single unseen review: Zero (movie)
text_clf.predict(
    [
        "Who thought this would make a good film? None of the people working on the film ever had this thought? I had to walk out during the second half because of how absurd it was",
    ]
)

array(['neg'], dtype=object)