<h1>Project Computer Vision Lecture</h1>
<h2>Title: Movie Review Sentiment Analysis</h2>

1. Daniel Santoso / 2201756506
2. Boban Nathaniel Seputra / 2201762540
3. Luwis Lim / 2201761771
4. Steven Odolf Yuwono / 2201758045

In [8]:
# Import libraries

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC

In [9]:
# Load csv and detect null values

df = pd.read_csv('IMDB Dataset.csv', delimiter=',')
print(df.isnull().values.any())

y = df.sentiment

print(df)

False
                                                  review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]


In [10]:
# Make all lower case

df['review'] = df['review'].str.lower()
print(df)

                                                  review sentiment
0      one of the other reviewers has mentioned that ...  positive
1      a wonderful little production. <br /><br />the...  positive
2      i thought this was a wonderful way to spend ti...  positive
3      basically there's a family where a little boy ...  negative
4      petter mattei's "love in the time of money" is...  positive
...                                                  ...       ...
49995  i thought this movie did a down right good job...  positive
49996  bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  i am a catholic taught in parochial elementary...  negative
49998  i'm going to have to disagree with the previou...  negative
49999  no one expects the star trek movies to be high...  negative

[50000 rows x 2 columns]


In [11]:
# Remove numbers and punctuation

import re

REPLACE_NO_SPACE = re.compile("[.;:!\'?,\"()\[\]]")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")

def preprocess_reviews(reviews):
    reviews = [REPLACE_NO_SPACE.sub("", line.lower()) for line in reviews]
    reviews = [REPLACE_WITH_SPACE.sub(" ", line) for line in reviews]
    
    return reviews

df['review'] = df['review'].apply(lambda x: re.sub(r"\d+", "", x))
df['review'] = preprocess_reviews(df['review'])
print(df)

                                                  review sentiment
0      one of the other reviewers has mentioned that ...  positive
1      a wonderful little production  the filming tec...  positive
2      i thought this was a wonderful way to spend ti...  positive
3      basically theres a family where a little boy j...  negative
4      petter matteis love in the time of money is a ...  positive
...                                                  ...       ...
49995  i thought this movie did a down right good job...  positive
49996  bad plot bad dialogue bad acting idiotic direc...  negative
49997  i am a catholic taught in parochial elementary...  negative
49998  im going to have to disagree with the previous...  negative
49999  no one expects the star trek movies to be high...  negative

[50000 rows x 2 columns]


In [12]:
# Remove stopwords and split into train, val, and test data

review = df['review']

stop_words = ['in', 'of', 'at', 'a', 'the']
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 3), stop_words=stop_words)
ngram_vectorizer.fit(review)
X = ngram_vectorizer.transform(review)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size = 0.8
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, train_size = 0.8
)

print(X_train)
print(y_train)
print(X_val)
print(y_val)
print(X_test)
print(y_test)

  (0, 7552)	1
  (0, 31524)	1
  (0, 31664)	1
  (0, 332340)	1
  (0, 358903)	1
  (0, 358950)	1
  (0, 366684)	1
  (0, 453230)	1
  (0, 453346)	1
  (0, 489280)	1
  (0, 489283)	1
  (0, 505627)	1
  (0, 505687)	1
  (0, 506065)	1
  (0, 506066)	1
  (0, 512511)	1
  (0, 512605)	1
  (0, 540414)	1
  (0, 540425)	1
  (0, 550503)	1
  (0, 550514)	1
  (0, 578402)	1
  (0, 578427)	1
  (0, 612383)	1
  (0, 613444)	1
  :	:
  (31999, 8819339)	1
  (31999, 9173112)	1
  (31999, 9180047)	1
  (31999, 9180321)	1
  (31999, 9180472)	1
  (31999, 9228343)	1
  (31999, 9231931)	1
  (31999, 9231943)	1
  (31999, 9263382)	1
  (31999, 9264446)	1
  (31999, 9265117)	1
  (31999, 9274218)	1
  (31999, 9274219)	1
  (31999, 9289823)	1
  (31999, 9294219)	1
  (31999, 9294331)	1
  (31999, 9335262)	1
  (31999, 9335384)	1
  (31999, 9335398)	1
  (31999, 9353070)	1
  (31999, 9419343)	1
  (31999, 9419987)	1
  (31999, 9472208)	1
  (31999, 9472519)	1
  (31999, 9472846)	1
18518    positive
22678    negative
29926    negative
28163    positive
3

In [13]:
# Train using different regularization parameter

for c in [1, 0.5, 0.1, 0.05, 0.01]:
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" % (c, accuracy_score(y_val, svm.predict(X_val))))



Accuracy for C=1: 0.90375
Accuracy for C=0.5: 0.90375
Accuracy for C=0.1: 0.903875
Accuracy for C=0.05: 0.904125
Accuracy for C=0.01: 0.9065


In [15]:
# Use the best regularization parameter to predict using test data

svm = LinearSVC(C=0.01)
svm.fit(X_train, y_train)
print ("Accuracy for C=%s: %s" % (0.01, accuracy_score(y_test, svm.predict(X_test))))

Accuracy for C=0.01: 0.9028
