In [44]:
# some basic imports
import numpy as np
import pandas as pd

In [46]:
# Load the IMDB reviews CSV into a dataframe and preview its first rows.
# The path is relative, so it is resolved against the kernel's working directory.
path = 'Data/IMDB Dataset.csv'
df = pd.read_csv(path)

df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [48]:
# (number of rows, number of columns) of the loaded dataframe
df.shape

(50000, 2)

In [50]:
# Read one raw review in full to see what noise (HTML tags, punctuation, ...)
# the cleaning step has to deal with. The row is picked by position (47542 is
# just the sample being inspected); the column is picked by name so the lookup
# does not silently change meaning if the column order ever differs.
df['review'].iloc[47542]

"Just saw 'The League of Gentlemen: Apocalypse' at a special screening in Manchester, with Mark Gatiss and Reece Shearsmith of the League in attendance.<br /><br />At the back was Peter Kay (who has a brief cameo in the film) affectionately heckling at the back during the Q & A session after the film.<br /><br />The film was complicated (in a good way) and very very funny. It follows Geoff Tipps, Hilary Briss and Herr Lipp as they try and save fictional Rosyton Vasey from the disinterest of their creators.<br /><br />The League play a wide range of their characters and themselves (or character based on themselves) and are ably supported by the cream of British character and comedy actors such as Bernard Hill, Victoria Wood and David Warner.<br /><br />Warner is a particular stand out reminding me of his smooth and cutting turn in 'Time Bandits'.<br /><br />The film swims in and out of various realities and allows some of the denizen's of Rosyton Vasey some space to grow beyond their us

#### How to clean and preprocess

- Remove HTML tags
- Remove URLs
- Remove punctuation
- Lowercase
- Remove stop words
- Stem / lemmatize
- Tokenize

## Cleaning and Preprocessing Text

In [52]:
# cleaning and preprocessing tools
from nlptools import clean, preprocess

In [54]:
# Text clean-up pipeline, applied to every review in this order:
#   1. remove html tags     2. remove urls        3. lowercase
#   4. remove punctuation   5. remove stop words
# The cleaned text overwrites the 'review' column.
# NOTE(review): punctuation is stripped before stop words are dropped, and the
# preview further down still shows tokens such as "youll" — contractions may be
# slipping past the stop-word filter; confirm against nlptools.clean.
df['review'] = (
    df['review']
    .apply(clean.remove_html_tags)
    .apply(clean.remove_urls)
    .str.lower()
    .apply(clean.remove_punctuations)
    .apply(clean.remove_stopwords)
)

In [55]:
# Stem each cleaned review once, then reuse the stemmed text twice:
#   - 'review'   is overwritten with the stemmed string
#   - 'word_vec' holds the tokenized form of that same stemmed string
# Because 'review' is both input and output here, re-running this cell feeds
# already-stemmed text back into the stemmer.
stemmed_reviews = df['review'].apply(preprocess.stem_text)
df['review'] = stemmed_reviews
df['word_vec'] = stemmed_reviews.apply(preprocess.tokenize_text)

In [56]:
# Preview the first 10 rows to check the cleaned 'review' text and the new 'word_vec' column.
df.head(10)

Unnamed: 0,review,sentiment,word_vec
0,one review mention watch 1 oz episod youll hoo...,positive,"[one, review, mention, watch, 1, oz, episod, y..."
1,wonder littl product film techniqu unassum old...,positive,"[wonder, littl, product, film, techniqu, unass..."
2,thought wonder way spend time hot summer weeke...,positive,"[thought, wonder, way, spend, time, hot, summe..."
3,basic there famili littl boy jake think there ...,negative,"[basic, there, famili, littl, boy, jake, think..."
4,petter mattei love time money visual stun film...,positive,"[petter, mattei, love, time, money, visual, st..."
5,probabl alltim favorit movi stori selfless sac...,positive,"[probabl, alltim, favorit, movi, stori, selfle..."
6,sure would like see resurrect date seahunt ser...,positive,"[sure, would, like, see, resurrect, date, seah..."
7,show amaz fresh innov idea 70 first air first ...,negative,"[show, amaz, fresh, innov, idea, 70, first, ai..."
8,encourag posit comment film look forward watch...,negative,"[encourag, posit, comment, film, look, forward..."
9,like origin gut wrench laughter like movi youn...,positive,"[like, origin, gut, wrench, laughter, like, mo..."
