In [1]:
import pandas as pd
import math
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
import pickle

In [2]:
imdb = pd.read_csv('IMDB_Full_Reviews.csv', index_col = 0)

In [3]:
imdb.shape

(123549, 2)

In [4]:
imdb.head()

Unnamed: 0,Review,Rating
0,I'll admit I raised an eyebrow when I saw that...,10.0
1,". . . The Riddle Maker, whose real name gets u...",9.0
2,"The Riddler(Paul Dano, spot-on. How did it tak...",9.0
3,Batman helps Lieutenant James Gordon investiga...,7.0
4,"THE BATMAN (2022) *** Robert Pattinson, Zoe Kr...",8.0


In [5]:
imdb['Rating'].value_counts()

10.0    41109
9.0     16023
8.0     13342
1.0     11111
7.0      9782
6.0      7678
5.0      6458
3.0      5114
2.0      4909
4.0      4861
Name: Rating, dtype: int64

In [6]:
imdb.tail()

Unnamed: 0,Review,Rating
123544,Loved it! I love Happy Madison films. Adam San...,9.0
123545,It could have truly been a great movie about t...,1.0
123546,"This film made me depressed, I will never get ...",1.0
123547,If it wasn't about football I'd give it a 2! H...,3.0
123548,They all played a different personality than w...,10.0


In [7]:
imdb.isnull().sum()

Review       0
Rating    3162
dtype: int64

In [8]:
imdb.dropna(inplace = True)
imdb.isnull().sum()

Review    0
Rating    0
dtype: int64

In [9]:
imdb['Rating'].value_counts()

10.0    41109
9.0     16023
8.0     13342
1.0     11111
7.0      9782
6.0      7678
5.0      6458
3.0      5114
2.0      4909
4.0      4861
Name: Rating, dtype: int64

In [10]:
imdb.drop(imdb[(imdb['Rating'] > 3) & (imdb['Rating'] < 8)].index, inplace =True)

In [11]:
imdb['Rating'].value_counts()

10.0    41109
9.0     16023
8.0     13342
1.0     11111
3.0      5114
2.0      4909
Name: Rating, dtype: int64

In [12]:
def map_rating(rating):

    if type(rating) == str:
        if rating == 'negative':
            return 0 # Negative sentiment
        else:
            return 1 # Positive sentiment
    else:
        if rating <= 3:
            return 0 
        else:
            return 1 
        
imdb['Rating'] = imdb['Rating'].apply(map_rating)
imdb.rename(columns = {'Review':'review', 'Rating':'sentiment'}, inplace = True)

In [13]:
imdb.head()

Unnamed: 0,review,sentiment
0,I'll admit I raised an eyebrow when I saw that...,1
1,". . . The Riddle Maker, whose real name gets u...",1
2,"The Riddler(Paul Dano, spot-on. How did it tak...",1
4,"THE BATMAN (2022) *** Robert Pattinson, Zoe Kr...",1
6,"Always been a ""Batman"" fan as the D. C. legend...",1


In [14]:
imdb['sentiment'].value_counts()

1    70474
0    21134
Name: sentiment, dtype: int64

In [15]:
imdb_50k = pd.read_csv('IMDB Dataset.csv')

In [16]:
imdb_50k['sentiment'].value_counts()

negative    25000
positive    25000
Name: sentiment, dtype: int64

In [17]:
imdb_50k.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [18]:
imdb_50k['sentiment'] = imdb_50k['sentiment'].apply(map_rating)
imdb_50k['sentiment'].value_counts()

1    25000
0    25000
Name: sentiment, dtype: int64

In [19]:
imdb_50k.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [20]:
imdb_neg = imdb_50k[imdb_50k['sentiment'] == 0]

In [21]:
imdb_neg['sentiment'].value_counts()

0    25000
Name: sentiment, dtype: int64

In [22]:
imdb = pd.concat([imdb, imdb_neg]).reset_index(drop = True)

In [23]:
imdb.shape

(116608, 2)

In [24]:
imdb.head()

Unnamed: 0,review,sentiment
0,I'll admit I raised an eyebrow when I saw that...,1
1,". . . The Riddle Maker, whose real name gets u...",1
2,"The Riddler(Paul Dano, spot-on. How did it tak...",1
3,"THE BATMAN (2022) *** Robert Pattinson, Zoe Kr...",1
4,"Always been a ""Batman"" fan as the D. C. legend...",1


In [25]:
imdb['sentiment'].value_counts()

1    70474
0    46134
Name: sentiment, dtype: int64

In [26]:
print(len(imdb) - len(imdb.drop_duplicates()))

942


In [27]:
imdb.drop_duplicates(inplace = True)

In [28]:
imdb['sentiment'].value_counts()[0]

45814

In [29]:
limit = imdb['sentiment'].value_counts()[0] - 1
pos = imdb[imdb['sentiment'] == 1].reset_index(drop = True)
neg = imdb[imdb['sentiment'] == 0]
new = pos.loc[:limit]
imdb = pd.concat([new, neg]).reset_index(drop = True)
imdb['sentiment'].value_counts()
# imdb.to_csv('imdb_clean_balanced.csv')

1    45814
0    45814
Name: sentiment, dtype: int64

In [30]:
stpwrds = set(stopwords.words('english'))

In [31]:
def remove_punc(text):
    # Replace sentence-ending punctuation with whitespace
    text = text.replace('.', ' ')
    text = text.replace('!', ' ')
    text = text.replace('?', ' ')
    # Replace commas and brackets with whitespace
    text = text.replace(',', ' ')
    text = text.replace('(', ' ')
    text = text.replace(')', ' ')
    # Replace word-dividing punctuation with whitespace to separate words
    text = text.replace('/', ' ')
    text = text.replace('-', ' ')
    # Remove the remaining punctuation from the reviews
    text = text.translate(str.maketrans('','', string.punctuation))
    return text

def fix_space(text):
    # Fix double spaces caused by replacing punctuation, and any other weird spaces
    text = text.split()
    text = " ".join(text)
    return text

def remove_html(text):
    # Remove html line breaks
    text = text.replace('<br />', '')
    # Remove all other possible html
    pattern = re.compile('<.*?>')
    text = pattern.sub(r'', text)
    return text

def remove_stopwords(text):
    # Remove common stopwords
    text = " ".join([string for string in str(text).split() if string not in stpwrds])
    return text

def lemmatize(text):
    # Convert words to their original lemma
    text = " ".join([WordNetLemmatizer().lemmatize(word) for word in text.split()])
    return text

def clean_text(text):
    text = text.lower()
    text = remove_html(text)
    text = remove_punc(text)
    text = fix_space(text)
    text = remove_stopwords(text)
    text = lemmatize(text)
    return text

imdb['clean_review'] = imdb['review'].apply(clean_text)

In [32]:
imdb['sentiment'].value_counts()

1    45814
0    45814
Name: sentiment, dtype: int64

In [33]:
print(len(imdb) - len(imdb.drop_duplicates()))

0


In [34]:
imdb['review'][2]

"The Riddler(Paul Dano, spot-on. How did it take this long for him to get a role like this?) targets public officials, revealing their corruption, and killing them in gruesome fashion(how did this get away with a PG-13 again? Oh, right, as long as you don't show the details, then you can get away with almost anything, by now. Even a barely toned down Jigsaw). In order to stop him, it will be necessary for the Bat and the Cat(with amazing chemistry), together, to stop the rat. Thanks, Matt.I really did not think that we needed yet another film dealing with Bruce Wayne's alter-ego. How about Nightwing? I know it's not likely, but part of me still holds out hope that Grayson will be turned into a feature(if you've never checked out the trailer on YouTube, you're missing out). But somehow, this managed to convince me. Fingers crossed for at least one sequel. Robert Pattinson is incredible here. Seriously, can we just stop freaking out every time the role is recast? The closest we've come t

In [35]:
imdb['clean_review'][2]

'riddler paul dano spot take long get role like target public official revealing corruption killing gruesome fashion get away pg 13 oh right long dont show detail get away almost anything even barely toned jigsaw order stop necessary bat cat amazing chemistry together stop rat thanks matt really think needed yet another film dealing bruce wayne alter ego nightwing know likely part still hold hope grayson turned feature youve never checked trailer youtube youre missing somehow managed convince finger crossed least one sequel robert pattinson incredible seriously stop freaking every time role recast closest weve come someone shouldnt george clooney like anybody came flick looking good took job dusk till dawn good reason think going badass honestly everyone give strong performance something definitely appeal others genre tone essentially se7en meet zodiac paced like typical massive blockbuster action great especially martial art mention car chase there le smaller scale example nolan trilo

In [36]:
X_train, X_test, y_train, y_test = train_test_split(imdb['clean_review'], imdb['sentiment'],
                                                   train_size = 0.8, random_state = 21)

In [37]:
X_train.shape

(73302,)

In [38]:
X_test.shape

(18326,)

In [39]:
word_vec = TfidfVectorizer(tokenizer = word_tokenize, analyzer = 'word',
                           ngram_range = (1, 3), max_features = 10000)
word_vec.fit(imdb['clean_review'])
X_train = word_vec.transform(X_train)
X_test = word_vec.transform(X_test)



In [40]:
# eta 0.2 - 93.33%,  n_estimators = 600 - 93.39%
xgmodel = XGBClassifier(max_depth = 8, eta = 0.2, objective = 'binary:logistic', n_estimators = 600,
                       subsample = 1, eval_metric = 'auc')
xgmodel.fit(X_train, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              eta=0.2, eval_metric='auc', gamma=0, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.200000003, max_delta_step=0, max_depth=8,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=600, n_jobs=8, num_parallel_tree=1, predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [41]:
pred = xgmodel.predict(X_test)
accuracy = metrics.accuracy_score(y_test, pred)
accuracy

0.9397577212703263

In [42]:
print(metrics.classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.94      0.93      0.94      9180
           1       0.94      0.94      0.94      9146

    accuracy                           0.94     18326
   macro avg       0.94      0.94      0.94     18326
weighted avg       0.94      0.94      0.94     18326



In [43]:
confusion = metrics.confusion_matrix(y_test, pred)
confusion

array([[8583,  597],
       [ 507, 8639]], dtype=int64)

In [44]:
pickle.dump(xgmodel, open('xgmodel.pkl', 'wb'))