Importing dataset and libraries

In [None]:
#Library import/download
import pandas as pd
import numpy as np
import re
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize
from nltk.corpus import stopwords 
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import string
import matplotlib.pyplot as plt
import itertools
from textblob import TextBlob
import pickle
from urllib.parse import urlparse
import requests
from urllib.request import urlopen, urlretrieve
from bs4 import BeautifulSoup

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Mount Google Drive so the dataset is reachable from the Colab runtime
# (this only works if the file has been shared with your Google account).
from google.colab import drive

drive.mount('/content/drive')

# Load the labelled movie reviews into a DataFrame.
df = pd.read_csv("drive/My Drive/movie_data.csv")

# Preview the first ten rows; from them we conclude that sentiment 0 means
# negative and sentiment 1 means positive.
df.head(n=10)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,review,sentiment
0,This movie is just crap. Even though the direc...,0
1,Another detailed work on the subject by Dr Dwi...,1
2,THE CAT O'NINE TAILS (Il Gatto a Nove Code) <b...,0
3,"Like with any movie genre, there are good gang...",0
4,I watched it with my mom and we were like...<b...,0
5,This movie is probably one of 3 worst movies m...,0
6,"this movie is quite bad, aggressive, not playe...",0
7,And a perfect film to watch during the holiday...,1
8,"I like Noel Coward, the wit. I like Noel Cowar...",0
9,"""The Days"" is a typical family drama with a li...",1


Functions: Model performance and data preprocessing

In [None]:
def model_performance(X_train: np.ndarray, y_train: np.ndarray,
                      X_test: np.ndarray, y_test: np.ndarray, model):
    """Fit ``model`` and print a classification report and confusion matrix.

    Note that the model is (re)fitted here, so callers do not need to fit it
    beforehand; passing an already fitted model trains it a second time.

    Parameters
    ----------
    X_train, y_train
        Training features and labels handed to ``model.fit``. The notebook
        passes the TF-IDF matrices produced by ``TfidfVectorizer`` here.
    X_test, y_test
        Held-out features and their true labels used for the report.
    model
        Any estimator exposing ``fit`` and ``predict``.

    Returns
    -------
    None
        The results are printed, nothing is returned.
    """
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)
    # The class probabilities were previously computed via predict_proba and
    # then discarded; that dead computation (and the silent except around it)
    # has been dropped because nothing below uses it.
    print('\nClassification Report:')
    print(classification_report(y_test, predicted))
    print('\nConfusion Matrix:')
    print(confusion_matrix(y_test, predicted))

# Shared stemmer instance used by preprocessor() for every token.
porter = PorterStemmer()


def preprocessor(text):
  """Clean one raw review and return it as a list of stemmed tokens.

  Steps:
    1. Strip HTML tags such as ``<br />``.
    2. Collect emoticons (``:)``, ``;-(``, ``=D``, ``:P`` ...) before the
       punctuation is removed, so they survive as tokens.
    3. Lower-case the text and collapse every run of non-word characters into
       a single space.
    4. Append the emoticons (with the ``-`` nose removed) as separate tokens.
    5. Stem every whitespace-separated token with the Porter stemmer.

  Parameters
  ----------
  text : str
      Raw review text, possibly containing HTML markup.

  Returns
  -------
  list
      The result of ``porter.stem`` for each token, in order.
  """
  # Raw strings keep the regex escapes (\W, \(, \)) from being parsed as
  # invalid Python string escapes.
  text = re.sub(r'<[^>]*>', '', text)
  emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
  cleaned = re.sub(r'[\W]+', ' ', text.lower())
  # The explicit ' ' separator stops the first emoticon from being glued onto
  # the last word when the cleaned text ends in a word character.
  text = cleaned + ' ' + ' '.join(emoticons).replace('-', '')
  return [porter.stem(word) for word in text.split()]


def fill(text):
  """Identity tokenizer: hand the document back exactly as it was received.

  The reviews are already split into tokens by ``preprocessor``, but the
  TF-IDF vectorizer could not be made to work without a ``tokenizer``
  callable, so this pass-through is supplied in that slot.
  """
  return text



In [None]:
# Tokenize and stem every review into a separate Series. df['review'] itself is
# left untouched so that this cell can be re-run without feeding token lists
# back into preprocessor (which expects raw strings).
review_tokens = df['review'].apply(preprocessor)
y = df.sentiment.values

#Splitting into test and train sets:
# The split happens BEFORE vectorization so that the IDF weights are learned
# from the training reviews only and no test-set statistics leak into the
# features. shuffle=False keeps the original row order: the first 70% of the
# rows are used for training and the last 30% for testing.
tokens_train, tokens_test, y_train, y_test = train_test_split(
    review_tokens, y, random_state=1, test_size=0.3, shuffle=False)

#Vectorization:
# The documents are already token lists, so lowercasing/preprocessing are
# disabled and the pass-through `fill` is used as tokenizer.
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None, tokenizer=fill, use_idf=True, norm='l2', smooth_idf=True)
X_train = tfidf.fit_transform(tokens_train)
X_test = tfidf.transform(tokens_test)

# The estimator is only constructed here; model_performance fits it. Fitting it
# here as well would run the multi-minute 5-fold CV search twice for the same
# result.
clf = LogisticRegressionCV(cv=5, scoring="accuracy", random_state=1, n_jobs=-1, verbose=3, max_iter=300)

#Fitting clf and printing the classification report and confusion matrix to get
#a better insight into the result.
model_performance(X_train, y_train, X_test, y_test, clf)

#Running the clf.predict function, which predicts the sentiment of each test review
yhat = clf.predict(X_test)

#Printing clf score (accuracy on the test set)
print("accuracy:")
print(clf.score(X_test, y_test))


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  3.5min finished


accuracy:
0.8993396037622573


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  3.5min finished



Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.89      0.90      7512
           1       0.89      0.91      0.90      7479

    accuracy                           0.90     14991
   macro avg       0.90      0.90      0.90     14991
weighted avg       0.90      0.90      0.90     14991


Confusion Matrix:
[[6685  827]
 [ 682 6797]]


**Webscraped data**

We reuse the dataset from the first model as training data, combine it with new reviews scraped from IMDb, and predict the sentiment of the scraped reviews.

In [None]:
#The movie "Joker" IMDb review page
url_link = 'https://www.imdb.com/title/tt7286456/reviews'

#Downloading the page and parsing it with BeautifulSoup. The context manager
#closes the HTTP response once the page has been parsed, and the parser is
#named explicitly so the result does not depend on which parsers are installed.
with urlopen(url_link) as html:
  content_bs = BeautifulSoup(html, 'html.parser')

#All the reviews end up in a div with class "text" in the html (can be found in
#the IMDb page source). get_text() keeps the complete review text. Storing the
#Tag objects themselves made DataFrame.from_records expand every review into
#one column per child node, of which only column 0 was kept, so any review
#containing markup was cut off after its first fragment.
JokerReviews = [b.get_text(' ', strip=True)
                for b in content_bs.find_all('div', class_='text')]

#Fail loudly if the page layout changed or the request was blocked, instead of
#silently continuing with an empty test set.
if not JokerReviews:
  raise RuntimeError('No reviews found at ' + url_link + ' (no <div class="text"> elements)')

#One row per scraped review, in a column already named like the original data
df2 = pd.DataFrame({'review': JokerReviews})
jokerdata2 = df2.copy()
#Adding a sentiment column in order to be able to stack this data directly on
#the original dataset. The value is only a placeholder (the true sentiment of
#the scraped reviews is unknown); it is an int to match the original labels.
jokerdata2['sentiment'] = 0
jokerdata2['dataset'] = "Webscraped"

#Loading the original dataset
df3 = pd.read_csv("drive/My Drive/movie_data.csv")
df3['dataset'] = "Original"

In [None]:
#Checking that we have matching columns
print(jokerdata2.columns)
print(df3.columns)

#Stacking the scraped reviews on top of the original data (scraped rows first).
#pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
#removed in pandas 2.0; ignore_index=True gives the stacked frame a clean,
#unique 0..N-1 index instead of two overlapping ones.
totalstack = pd.concat([jokerdata2, df3], ignore_index=True)

#Checking that we get the expected shape
print(totalstack.shape)
print(df2)

Index(['review', 'sentiment', 'dataset'], dtype='object')
Index(['review', 'sentiment', 'dataset'], dtype='object')
(49994, 3)
                                                    0  ...                  24
0   I was a person that saw all the hype and claim...  ...                None
1   Every once in a while a movie comes, that trul...  ...                None
2   This is a movie that only those who have felt ...  ...                None
3   Truly a masterpiece, The Best Hollywood film o...  ...                None
4   Joaquin Phoenix gives a tour de force performa...  ...                None
5   Most of the time movies are anticipated like t...  ...                None
6   Let me start off by saying if Joaquin Phoneix ...  ...                None
7   I get why some people hate this . It's because...  ...                None
8   I have seen Joker yesterday at Venice an early...  ...                None
9   It's sad that Joaquin missed Oscar for 'The gl...  ...                None
10  

In [None]:
#applying preprocessing
#preprocessor turns a raw review string into a list of tokens. Rows that are
#already token lists (i.e. this cell has been run before) are passed through
#unchanged, so re-running the cell no longer fails on its own output.
totalstack['review'] = totalstack['review'].apply(
    lambda review: review if isinstance(review, list) else preprocessor(review))
print(totalstack['review'])

In [None]:
#Vectorization of the entire data (original + webscraped reviews together, so
#both share one vocabulary and one set of IDF weights)
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None, tokenizer=fill, use_idf=True, norm='l2', smooth_idf=True)
y = totalstack.sentiment.values
X = tfidf.fit_transform(totalstack.review)

#Selecting the webscraped rows through the 'dataset' marker column rather than
#assuming they are the first len(df2) rows of the stacked frame; this stays
#correct even if the stacking order or the scraped frame changes.
is_scraped = (totalstack['dataset'] == "Webscraped").values

#Number of webscraped reviews
n = int(is_scraped.sum())

#The webscraped data is stored as test data, the rest is used for training.
#NOTE: test_y only holds the placeholder label given to the scraped rows, not
#a true sentiment, so it must not be used to score the model.
test_y = y[is_scraped]
test_X = X[is_scraped]
train_y = y[~is_scraped]
train_X = X[~is_scraped]
#Changing types to int
train_y = train_y.astype('int')
test_y = test_y.astype('int')

In [None]:
# Build the cross-validated logistic regression, then train it on the training
# split only.
clf = LogisticRegressionCV(
    cv=5,
    scoring="accuracy",
    random_state=1,
    n_jobs=-1,
    verbose=3,
    max_iter=300,
)
clf.fit(train_X, train_y)

# Predict the sentiment of every row in the test split and show the labels.
yhat = clf.predict(test_X)
print(yhat)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  4.7min finished


[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 0 1]
