In [1]:
import pandas as pd 

In [2]:
df = pd.read_csv('../data/movie.csv')

In [3]:
df.head()

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    40000 non-null  object
 1   label   40000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 625.1+ KB


In [5]:
df['label'].value_counts()

label
0    20019
1    19981
Name: count, dtype: int64

In [6]:
df['text'].str.lower()

0        i grew up (b. 1965) watching and loving the th...
1        when i put this movie in my dvd player, and sa...
2        why do people who do not know what a particula...
3        even though i have great interest in biblical ...
4        im a die hard dads army fan and nothing will e...
                               ...                        
39995    "western union" is something of a forgotten cl...
39996    this movie is an incredible piece of work. it ...
39997    my wife and i watched this movie because we pl...
39998    when i first watched flatliners, i was amazed....
39999    why would this film be so good, but only gross...
Name: text, Length: 40000, dtype: object

## preprocessing

In [7]:
import re

In [8]:
df['text'] = df['text'].apply(lambda x: re.sub(r'[^a-z A-Z 0-9\s]', '', x))

In [9]:
df.head()

Unnamed: 0,text,label
0,I grew up b 1965 watching and loving the Thund...,0
1,When I put this movie in my DVD player and sat...,0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [10]:
df['text'] = df['text'].apply(lambda x:x.lower())

In [11]:
df.head()

Unnamed: 0,text,label
0,i grew up b 1965 watching and loving the thund...,0
1,when i put this movie in my dvd player and sat...,0
2,why do people who do not know what a particula...,0
3,even though i have great interest in biblical ...,0
4,im a die hard dads army fan and nothing will e...,1


In [12]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# nltk.download('punkt_tab')
# nltk.download('stopwords')


In [13]:
# Initialize stemmer and stopword list
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    # Tokenize
    words = word_tokenize(text.lower())
    
    # Remove stopwords and apply stemming
    filtered_words = [stemmer.stem(word) for word in words if word.isalnum() and word not in stop_words]
    
    # Join back into a string
    return ' '.join(filtered_words)


In [14]:
df['cleaned_text'] = df['text'].apply(clean_text)


In [15]:
df.head()

Unnamed: 0,text,label,cleaned_text
0,i grew up b 1965 watching and loving the thund...,0,grew b 1965 watch love thunderbird mate school...
1,when i put this movie in my dvd player and sat...,0,put movi dvd player sat coke chip expect hope ...
2,why do people who do not know what a particula...,0,peopl know particular time past like feel need...
3,even though i have great interest in biblical ...,0,even though great interest biblic movi bore de...
4,im a die hard dads army fan and nothing will e...,1,im die hard dad armi fan noth ever chang got t...


In [16]:
df.drop(columns='text',inplace=True)

In [17]:
df.head()

Unnamed: 0,label,cleaned_text
0,0,grew b 1965 watch love thunderbird mate school...
1,0,put movi dvd player sat coke chip expect hope ...
2,0,peopl know particular time past like feel need...
3,0,even though great interest biblic movi bore de...
4,1,im die hard dad armi fan noth ever chang got t...


## Splitting Data Train, Test and Split

In [18]:
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
import numpy as np 

In [19]:
x = df[['cleaned_text']]
y = df['label']

In [20]:
x

Unnamed: 0,cleaned_text
0,grew b 1965 watch love thunderbird mate school...
1,put movi dvd player sat coke chip expect hope ...
2,peopl know particular time past like feel need...
3,even though great interest biblic movi bore de...
4,im die hard dad armi fan noth ever chang got t...
...,...
39995,western union someth forgotten classic western...
39996,movi incred piec work explor everi nook cranni...
39997,wife watch movi plan visit sicili stromboli so...
39998,first watch flatlin amaz necessari featur good...


In [21]:
y.head()

0    0
1    0
2    0
3    0
4    1
Name: label, dtype: int64

In [22]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.25,random_state=42)

In [23]:
x_train.head()

Unnamed: 0,cleaned_text
26898,fifth grade languag art teacher read book stud...
27635,low budget brit pop melodrama focus girl want ...
3036,well ok watch movi littl 2 year ago pull dusti...
5604,would almost give 10 10 howev confus part well...
36111,full length featur film world bridg found firs...


In [24]:
y_train.head()

26898    0
27635    1
3036     0
5604     1
36111    1
Name: label, dtype: int64

In [25]:
from sklearn.feature_extraction.text import CountVectorizer

In [26]:
counter = CountVectorizer(max_features=10000, stop_words='english')  # optional limit
x_train_counter = counter.fit_transform(x_train['cleaned_text'])
x_test_counter = counter.transform(x_test['cleaned_text'])

In [27]:
x_train_counter

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 2333252 stored elements and shape (30000, 10000)>

In [28]:
# To see the feature name at index 0
first_feature = counter.get_feature_names_out()[0]
print(first_feature)

007


In [29]:
from sklearn.ensemble import RandomForestClassifier

In [30]:
rf = RandomForestClassifier(n_estimators=100)

In [31]:
rf.fit(x_train_counter,y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [32]:
y_pred = rf.predict(x_test_counter)

In [33]:
from sklearn.metrics import accuracy_score,f1_score

In [34]:
y_test.head()

32823    0
16298    1
28505    0
6689     1
26893    1
Name: label, dtype: int64

In [35]:
y_pred

array([1, 1, 1, ..., 0, 0, 0], shape=(10000,))

In [36]:
score = accuracy_score(y_test,y_pred)

In [37]:
print(score)

0.8482


In [38]:
# import pickle

In [41]:
# with open('model.pkl','wb') as file:
#     pickle.dump(rf,file)

In [42]:
# with open('preprocessor.pkl','wb') as file:
#     pickle.dump(counter,file)

In [49]:
input_str = ["""I grew up (b. 1965) watching and loving the Thunderbirds. All my mates at school watched. We played "Thunderbirds" before school, during lunch and after school. We all wanted to be Virgil or Scott. No one wanted to be Alan. Counting down from 5 became an art form. I took my children to see the movie hoping they would get a glimpse of what I loved as a child. How bitterly disappointing. The only high point was the snappy theme tune. Not that it could compare with the original score of the Thunderbirds. Thankfully early Saturday mornings one television channel still plays reruns of the series Gerry Anderson and his wife created. Jonatha Frakes should hand in his directors chair, his version was completely hopeless. A waste of film. Utter rubbish. A CGI remake may be acceptable but replacing marionettes with Homo sapiens subsp. sapiens was a huge error of judgment.
"""]

In [50]:
input_str

['I grew up (b. 1965) watching and loving the Thunderbirds. All my mates at school watched. We played "Thunderbirds" before school, during lunch and after school. We all wanted to be Virgil or Scott. No one wanted to be Alan. Counting down from 5 became an art form. I took my children to see the movie hoping they would get a glimpse of what I loved as a child. How bitterly disappointing. The only high point was the snappy theme tune. Not that it could compare with the original score of the Thunderbirds. Thankfully early Saturday mornings one television channel still plays reruns of the series Gerry Anderson and his wife created. Jonatha Frakes should hand in his directors chair, his version was completely hopeless. A waste of film. Utter rubbish. A CGI remake may be acceptable but replacing marionettes with Homo sapiens subsp. sapiens was a huge error of judgment.\n']

In [51]:
input_str = counter.transform(input_str)

In [52]:
input_str

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 31 stored elements and shape (1, 10000)>

In [53]:
output = rf.predict(input_str)

In [54]:
output

array([0])

In [55]:
input_str2 = ["""Im a die hard Dads Army fan and nothing will ever change that. I got all the tapes, DVD's and audiobooks and every time i watch/listen to them its brand new. <br /><br />The film. The film is a re run of certain episodes, Man and the hour, Enemy within the gates, Battle School and numerous others with a different edge. Introduction of a new General instead of Captain Square was a brilliant move - especially when he wouldn't cash the cheque (something that is rarely done now).<br /><br />It follows through the early years of getting equipment and uniforms, starting up and training. All in all, its a great film for a boring Sunday afternoon. <br /><br />Two draw backs. One is the Germans bogus dodgy accents (come one, Germans cant pronounced the letter "W" like us) and Two The casting of Liz Frazer instead of the familiar Janet Davis. I like Liz in other films like the carry ons but she doesn't carry it correctly in this and Janet Davis would have been the better choice.
"""]

In [56]:
input_str2 = counter.transform(input_str2)

In [57]:
output = rf.predict(input_str2)

In [58]:
output

array([1])