In [None]:
# TfidfVectorizer + LinearSVC

In [1]:
import pandas as pd

from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# dataset stored under MyDrive can be read through an ordinary file path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import json
import pandas as pd

# Location of the IMDB review dump on the mounted Drive.
REVIEWS_PATH = '/content/drive/MyDrive/IMDB_reviews.json'

# lines=True: the file is parsed as one JSON object per line.
ds_reviews = pd.read_json(REVIEWS_PATH, lines=True)

In [None]:
# Display the loaded reviews frame to inspect its columns and overall size.
ds_reviews

Unnamed: 0,review_date,movie_id,user_id,is_spoiler,review_text,rating,review_summary
0,10 February 2006,tt0111161,ur1898687,True,"In its Oscar year, Shawshank Redemption (writt...",10,A classic piece of unforgettable film-making.
1,6 September 2000,tt0111161,ur0842118,True,The Shawshank Redemption is without a doubt on...,10,Simply amazing. The best film of the 90's.
2,3 August 2001,tt0111161,ur1285640,True,I believe that this film is the best story eve...,8,The best story ever told on film
3,1 September 2002,tt0111161,ur1003471,True,"**Yes, there are SPOILERS here**This film has ...",10,Busy dying or busy living?
4,20 May 2004,tt0111161,ur0226855,True,At the heart of this extraordinary movie is a ...,8,"Great story, wondrously told and acted"
...,...,...,...,...,...,...,...
573908,8 August 1999,tt0139239,ur0100166,False,"Go is wise, fast and pure entertainment. Assem...",10,The best teen movie of the nineties
573909,31 July 1999,tt0139239,ur0021767,False,"Well, what shall I say. this one´s fun at any ...",9,Go - see the movie
573910,20 July 1999,tt0139239,ur0392750,False,"Go is the best movie I have ever seen, and I'v...",10,It's the best movie I've ever seen
573911,11 June 1999,tt0139239,ur0349105,False,Call this 1999 teenage version of Pulp Fiction...,3,Haven't we seen this before?


In [13]:
# Training set: 22000 examples (approximately the same size as used with the neural network),
# balanced classes: 11000 spoiler reviews and 11000 non-spoiler reviews.
# A fixed seed makes the draw reproducible across kernel restarts.
TRAIN_SEED = 42

# Split the full corpus into the two class pools.
spoiler_reviews = ds_reviews.loc[ds_reviews['is_spoiler'] == True]
no_spoiler_reviews = ds_reviews.loc[ds_reviews['is_spoiler'] == False]

# Draw the same number of rows from each pool so the classes stay balanced.
small_spoiler_train = spoiler_reviews.sample(n=11000, random_state=TRAIN_SEED)
small_no_spoiler_train = no_spoiler_reviews.sample(n=11000, random_state=TRAIN_SEED)

# ignore_index=False keeps the original row labels, so every sampled row can
# still be traced back to its position in ds_reviews.
train_reviews = pd.concat([small_spoiler_train, small_no_spoiler_train], ignore_index=False)
train_reviews

Unnamed: 0,review_date,movie_id,user_id,is_spoiler,review_text,rating,review_summary
257599,27 February 2017,tt4975722,ur36043108,True,This is quite a surprising movie on many level...,8,Under the moon of love
42236,10 June 2006,tt0081505,ur8086637,True,*******SPOILER ALERT******** Just got finished...,9,Possibly one of the most terrifying rising act...
147724,25 May 2011,tt1298650,ur26316341,True,"First things, first : the fourth installment o...",5,a disappointing end to a grande saga
155169,5 November 2011,tt0448694,ur3494091,True,"A fine, fun adventure for kids and their paren...",10,Beautiful and fun
423708,14 September 2015,tt0758758,ur60028700,True,Into the Wild (2007): Dir: Sean Penn / Cast: E...,8,Hirsch Goes Wild.
...,...,...,...,...,...,...,...
30539,4 June 1999,tt0120586,ur0337549,False,"After seeing this, I could understand why the ...",4,YET ANOTHER HIGHLY OVERRATED FILM
464007,13 January 2010,tt0988045,ur12431180,False,Guy Ritchie brings a fresh look to the legenda...,9,Elementary Good Fun
301377,17 September 2010,tt0163025,ur23794235,False,After the success of the first two Jp movies t...,10,Jurassic Park III 10 out of 10 EXCEL...
242454,23 September 2015,tt2719848,ur48675674,False,"""Everest"" (2015 release; 122 min.) brings the ...",9,Disaster movie that packs a emotional wallop


In [14]:
# Test set: 6000 examples (approximately the same size as used with the neural network),
# balanced classes: 3000 spoiler reviews and 3000 non-spoiler reviews.
TEST_SEED = 43

# Remove the rows already drawn for training before sampling, otherwise the same
# review can end up in both the training and the test set and inflate the scores.
spoiler_test_pool = spoiler_reviews.drop(index=small_spoiler_train.index)
no_spoiler_test_pool = no_spoiler_reviews.drop(index=small_no_spoiler_train.index)

# A fixed seed makes the draw reproducible across kernel restarts.
small_spoiler_test = spoiler_test_pool.sample(n=3000, random_state=TEST_SEED)
small_no_spoiler_test = no_spoiler_test_pool.sample(n=3000, random_state=TEST_SEED)

# ignore_index=False keeps the original row labels from ds_reviews.
val_reviews = pd.concat([small_spoiler_test, small_no_spoiler_test], ignore_index=False)
val_reviews

Unnamed: 0,review_date,movie_id,user_id,is_spoiler,review_text,rating,review_summary
371471,4 May 2006,tt0356680,ur9177788,True,When will Hollywood get it? It is rare to for ...,4,Is it over yet?!
331148,26 December 2004,tt0327056,ur1535417,True,I must say I'm REALLY disappointed by this mov...,6,"Well directed, well acted but spoiled by a sil..."
501505,26 March 2012,tt0107614,ur6918917,True,"Robin Williams, Sally Field and Pierce Brosnan...",7,"""One Of Robin's Best!"""
64994,20 January 2007,tt0041959,ur4428199,True,The Third Man is all sizzle and flimsy plot. T...,7,"All sizzle, flimsy plot"
48963,31 August 2013,tt0105236,ur46172017,True,"This movie is my favourite movie of all time, ...",10,Masterpiece
...,...,...,...,...,...,...,...
150209,21 August 2012,tt1229238,ur35537696,False,So I watched Mission Impossible: Ghost Protoco...,5,"Fun, but in the end, forgettable"
536199,22 August 2015,tt0118884,ur2898520,False,Dr. Ellie Arroway (Jodie Foster) starts work f...,7,Big ideas
261419,10 January 2017,tt1860357,ur23018536,False,"""We are a big company, millions of moving part...",8,The action is quick and relentless and a movie...
253595,30 March 2016,tt2948356,ur62417210,False,Zootopia is the newest animated movie from Wal...,10,Another modern classic from Disney


In [15]:
# preprocessing (lowercase and nltk word tokenization)

def preprocess(text):
    """Normalize a review for TF-IDF vectorization.

    The text is lowercased, split into tokens with ``word_tokenize``, and only
    tokens consisting entirely of alphabetic characters are kept (numbers,
    punctuation and mixed tokens are dropped).

    Args:
        text: the raw review text (a string).

    Returns:
        The surviving tokens joined by single spaces.
    """
    lowered = text.lower()
    return ' '.join(tok for tok in word_tokenize(lowered) if tok.isalpha())

In [16]:
# TF-IDF features for the training texts.
# English stop words are removed and terms that occur in fewer than 3 documents
# are dropped from the vocabulary (min_df=3).
tfidf = TfidfVectorizer(stop_words='english', min_df=3)

train_texts = train_reviews.review_text.apply(preprocess).values

# Keep the matrix sparse: TF-IDF rows are almost entirely zeros and LinearSVC
# accepts sparse input directly, so densifying with .toarray() only costs memory
# (22000 rows x vocabulary size of floats).
X_train = tfidf.fit_transform(train_texts)
y_train = train_reviews.is_spoiler.values

In [17]:
# Vectorize the test texts with the vocabulary and IDF weights learned on the
# training texts (transform only, no refitting).
test_texts = val_reviews.review_text.apply(preprocess).values

# Keep the matrix sparse: LinearSVC predicts on sparse input directly, so
# densifying with .toarray() only costs memory.
X_test = tfidf.transform(test_texts)
y_test = val_reviews.is_spoiler.values

In [18]:
# Fit a linear SVM on the TF-IDF features.
# random_state pins the pseudo-random shuffling used by the solver (relevant when
# the dual formulation is used), so repeated runs produce the same model.
svm = LinearSVC(random_state=0)
svm.fit(X_train, y_train)

In [19]:
# evaluation: per-class precision / recall / F1 plus overall accuracy on the test set
y_pred = svm.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

       False       0.66      0.66      0.66      3000
        True       0.66      0.66      0.66      3000

    accuracy                           0.66      6000
   macro avg       0.66      0.66      0.66      6000
weighted avg       0.66      0.66      0.66      6000

