In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Top 5 data records

In [4]:
reviews = pd.read_csv("./IMDB_Review_Dataset.csv")
reviews.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## Bottom 5 Data Records

In [5]:
reviews.tail()

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


In [10]:
newDf = reviews.head(10)

newDf.to_csv('temp.csv')

In [167]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [168]:
# Check for any null fields
reviews.isnull().any()

review       False
sentiment    False
dtype: bool

In [169]:
reviews['sentiment'].unique()

array(['positive', 'negative'], dtype=object)

In [170]:
reviews['bin_sentiment'] = (reviews['sentiment'] == 'positive').astype(int)
reviews['bin_sentiment']

0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: bin_sentiment, Length: 50000, dtype: int64

In [171]:
reviews.columns

Index(['review', 'sentiment', 'bin_sentiment'], dtype='object')

In [172]:
import re

def clean_review(s: str):
    lower = s.lower() # lowercase
    res = re.sub(r'[^a-zA-Z0-9\s]', '', lower) # remove specials chars - chars other than A-Z, a-z and 0-9
    return res

In [173]:
reviews['cleaned_reviews'] = reviews['review'].apply(clean_review)
reviews

Unnamed: 0,review,sentiment,bin_sentiment,cleaned_reviews
0,One of the other reviewers has mentioned that ...,positive,1,one of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,positive,1,a wonderful little production br br the filmin...
2,I thought this was a wonderful way to spend ti...,positive,1,i thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...,negative,0,basically theres a family where a little boy j...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1,petter matteis love in the time of money is a ...
...,...,...,...,...
49995,I thought this movie did a down right good job...,positive,1,i thought this movie did a down right good job...
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,0,bad plot bad dialogue bad acting idiotic direc...
49997,I am a Catholic taught in parochial elementary...,negative,0,i am a catholic taught in parochial elementary...
49998,I'm going to have to disagree with the previou...,negative,0,im going to have to disagree with the previous...


In [174]:
# random samples
reviews.sample(2)

Unnamed: 0,review,sentiment,bin_sentiment,cleaned_reviews
24581,I saw The Big Bad Swim at the 2006 Temecula fi...,positive,1,i saw the big bad swim at the 2006 temecula fi...
15237,What the hell is in your minds ? This film suc...,negative,0,what the hell is in your minds this film suck...


In [175]:
reviews['tokens'] = reviews['cleaned_reviews'].apply(lambda x: x.split(" "))
reviews

Unnamed: 0,review,sentiment,bin_sentiment,cleaned_reviews,tokens
0,One of the other reviewers has mentioned that ...,positive,1,one of the other reviewers has mentioned that ...,"[one, of, the, other, reviewers, has, mentione..."
1,A wonderful little production. <br /><br />The...,positive,1,a wonderful little production br br the filmin...,"[a, wonderful, little, production, br, br, the..."
2,I thought this was a wonderful way to spend ti...,positive,1,i thought this was a wonderful way to spend ti...,"[i, thought, this, was, a, wonderful, way, to,..."
3,Basically there's a family where a little boy ...,negative,0,basically theres a family where a little boy j...,"[basically, theres, a, family, where, a, littl..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1,petter matteis love in the time of money is a ...,"[petter, matteis, love, in, the, time, of, mon..."
...,...,...,...,...,...
49995,I thought this movie did a down right good job...,positive,1,i thought this movie did a down right good job...,"[i, thought, this, movie, did, a, down, right,..."
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,0,bad plot bad dialogue bad acting idiotic direc...,"[bad, plot, bad, dialogue, bad, acting, idioti..."
49997,I am a Catholic taught in parochial elementary...,negative,0,i am a catholic taught in parochial elementary...,"[i, am, a, catholic, taught, in, parochial, el..."
49998,I'm going to have to disagree with the previou...,negative,0,im going to have to disagree with the previous...,"[im, going, to, have, to, disagree, with, the,..."


In [176]:
reviews['no_of_tokens'] = reviews['tokens'].apply(lambda x: len(x))
reviews

Unnamed: 0,review,sentiment,bin_sentiment,cleaned_reviews,tokens,no_of_tokens
0,One of the other reviewers has mentioned that ...,positive,1,one of the other reviewers has mentioned that ...,"[one, of, the, other, reviewers, has, mentione...",307
1,A wonderful little production. <br /><br />The...,positive,1,a wonderful little production br br the filmin...,"[a, wonderful, little, production, br, br, the...",162
2,I thought this was a wonderful way to spend ti...,positive,1,i thought this was a wonderful way to spend ti...,"[i, thought, this, was, a, wonderful, way, to,...",166
3,Basically there's a family where a little boy ...,negative,0,basically theres a family where a little boy j...,"[basically, theres, a, family, where, a, littl...",138
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1,petter matteis love in the time of money is a ...,"[petter, matteis, love, in, the, time, of, mon...",230
...,...,...,...,...,...,...
49995,I thought this movie did a down right good job...,positive,1,i thought this movie did a down right good job...,"[i, thought, this, movie, did, a, down, right,...",194
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,0,bad plot bad dialogue bad acting idiotic direc...,"[bad, plot, bad, dialogue, bad, acting, idioti...",112
49997,I am a Catholic taught in parochial elementary...,negative,0,i am a catholic taught in parochial elementary...,"[i, am, a, catholic, taught, in, parochial, el...",230
49998,I'm going to have to disagree with the previou...,negative,0,im going to have to disagree with the previous...,"[im, going, to, have, to, disagree, with, the,...",212
