In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('moviereviews.csv')
df.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   2000 non-null   object
 1   review  1965 non-null   object
dtypes: object(2)
memory usage: 31.4+ KB


In [4]:
df.isna().sum()

label      0
review    35
dtype: int64

In [5]:
df.shape

(2000, 2)

In [6]:
df.dropna(inplace =True)

In [7]:
df.shape

(1965, 2)

In [8]:
df['label'].value_counts()

label
neg    983
pos    982
Name: count, dtype: int64

In [9]:
df['review'].str.isspace().sum()

27

In [10]:
df[df['review'].str.isspace()]

Unnamed: 0,label,review
57,neg,
71,pos,
147,pos,
151,pos,
283,pos,
307,pos,
313,neg,
323,pos,
343,pos,
351,neg,


In [11]:
df[~df['review'].str.isspace()]

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...
...,...,...
1995,pos,"i like movies with albert brooks , and i reall..."
1996,pos,it might surprise some to know that joel and e...
1997,pos,the verdict : spine-chilling drama from horror...
1998,pos,i want to correct what i wrote in a former ret...


In [12]:
df = df[~df['review'].str.isspace()]

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1938 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   1938 non-null   object
 1   review  1938 non-null   object
dtypes: object(2)
memory usage: 45.4+ KB


In [14]:
df['review'].str.isspace().sum()

0

### EDA on Bag of Words

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words = 'english')

In [16]:
df[df['label']=='neg']['review']

0       how do films like mouse hunt get into theatres...
1       some talented actresses are blessed with a dem...
4       my first press screening of 1998 and already i...
5       to put it bluntly , ed wood would have been pr...
6       synopsis : melissa , a mentally-disturbed woma...
                              ...                        
1985    the real blonde ( r ) a woman's face , an arm ...
1986     * * * the following review contains spoilers ...
1987     " book " should have remained in shadows \r\n...
1991    all right , all right , we get the point : des...
1992    say , tell me if you've seen this before : a c...
Name: review, Length: 969, dtype: object

In [17]:
matrix=cv.fit_transform(df[df['label']=='neg']['review'])
cv.get_feature_names_out()

array(['00', '000', '007', ..., 'zwick', 'zwigoff', 'zzzzzzz'],
      dtype=object)

In [18]:
#cv.vocabulary_

In [19]:
#matrix.sum(axis =0).tolist()[0]

In [20]:
## Top 20 words in Negative Reviews

In [20]:
matrix=cv.fit_transform(df[df['label']=='neg']['review'])
freq = zip(cv.get_feature_names_out(),matrix.sum(axis =0).tolist()[0])
print('Top 20 words used for Negative Reviews')
sorted(freq,key = lambda x :-x[1])[:20]

Top 20 words used for Negative Reviews


[('film', 4063),
 ('movie', 3131),
 ('like', 1808),
 ('just', 1480),
 ('time', 1127),
 ('good', 1117),
 ('bad', 997),
 ('character', 926),
 ('story', 908),
 ('plot', 888),
 ('characters', 838),
 ('make', 813),
 ('really', 743),
 ('way', 734),
 ('little', 696),
 ('don', 683),
 ('does', 666),
 ('doesn', 648),
 ('action', 635),
 ('scene', 634)]

In [21]:
help(sorted)

Help on built-in function sorted in module builtins:

sorted(iterable, /, *, key=None, reverse=False)
    Return a new list containing all items from the iterable in ascending order.
    
    A custom key function can be supplied to customize the sort order, and the
    reverse flag can be set to request the result in descending order.



In [22]:
sorted(freq,key = lambda x : -x[1])

[]

In [23]:
df[df['label']=='pos']['review']

2       this has been an extraordinary year for austra...
3       according to hollywood movies made in last few...
11      with stars like sigourney weaver ( " alien " t...
16      i remember hearing about this film when it fir...
18      garry shandling makes his long overdue starrin...
                              ...                        
1995    i like movies with albert brooks , and i reall...
1996    it might surprise some to know that joel and e...
1997    the verdict : spine-chilling drama from horror...
1998    i want to correct what i wrote in a former ret...
1999    a couple of months ago , when i first download...
Name: review, Length: 969, dtype: object

In [24]:
matrix=cv.fit_transform(df[df['label']=='pos']['review'])
freq = zip(cv.get_feature_names_out(),matrix.sum(axis =0).tolist()[0])
print('Top 20 words used for Positive Revies')
sorted(freq,key = lambda x : -x[1])[:20]

Top 20 words used for Positive Revies


[('film', 5002),
 ('movie', 2389),
 ('like', 1721),
 ('just', 1273),
 ('story', 1199),
 ('good', 1193),
 ('time', 1175),
 ('character', 1037),
 ('life', 1032),
 ('characters', 957),
 ('way', 864),
 ('films', 851),
 ('does', 828),
 ('best', 788),
 ('people', 769),
 ('make', 764),
 ('little', 751),
 ('really', 731),
 ('man', 728),
 ('new', 702)]