# Preprocessing the data

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import string
import nltk

from nltk.corpus import stopwords


nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/antoine/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Load the dataset

We want to process the movie summaries to analyze them.

In [3]:
data_folder = '../../data/MovieSummaries'
df_plot = pd.read_csv(data_folder + '/plot_summaries.txt', sep="\t", header=None)
df_plot.columns = ["Movie ID", "Plot summary"]

df_plot[['Movie ID', 'Plot summary']]

Unnamed: 0,Movie ID,Plot summary
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
1,31186339,The nation of Panem consists of a wealthy Capi...
2,20663735,Poovalli Induchoodan is sentenced for six yea...
3,2231378,"The Lemon Drop Kid , a New York City swindler,..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...
...,...,...
42298,34808485,"The story is about Reema , a young Muslim scho..."
42299,1096473,"In 1928 Hollywood, director Leo Andreyev look..."
42300,35102018,American Luthier focuses on Randy Parsons’ tra...
42301,8628195,"Abdur Rehman Khan , a middle-aged dry fruit se..."


We are preprocessing the data of the movie summaries using the following steps:
1. Remove all punctuations
2. Put all characters of the text in lower case
3. Remove stop words
4. Use stemming or lemmatization

In [4]:
# Remove punctuation and transform all char in lower case
df_plot["Plot summary"] = df_plot["Plot summary"].apply(lambda x: x.translate(str.maketrans("", "", string.punctuation)))
df_plot[['Plot summary']]

Unnamed: 0,Plot summary
0,Shlykov a hardworking taxi driver and Lyosha a...
1,The nation of Panem consists of a wealthy Capi...
2,Poovalli Induchoodan is sentenced for six yea...
3,The Lemon Drop Kid a New York City swindler i...
4,Seventhday Adventist Church pastor Michael Cha...
...,...
42298,The story is about Reema a young Muslim schoo...
42299,In 1928 Hollywood director Leo Andreyev looks...
42300,American Luthier focuses on Randy Parsons’ tra...
42301,Abdur Rehman Khan a middleaged dry fruit sell...


In [5]:
# Remove stop words

# Use the nltk package to load english stop words
# If package not usable, create manually a list of stop words to remove
stop_words = set(stopwords.words('english'))
# print(stop_words)

df_plot["Plot summary"] = df_plot["Plot summary"].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))
df_plot[['Plot summary']]

Unnamed: 0,Plot summary
0,Shlykov hardworking taxi driver Lyosha saxopho...
1,nation Panem consists wealthy Capitol twelve p...
2,Poovalli Induchoodan sentenced six years priso...
3,Lemon Drop Kid New York City swindler illegall...
4,Seventhday Adventist Church pastor Michael Cha...
...,...
42298,story Reema young Muslim schoolgirl Malabar lo...
42299,1928 Hollywood director Leo Andreyev looks pho...
42300,American Luthier focuses Randy Parsons’ transf...
42301,Abdur Rehman Khan middleaged dry fruit seller ...


In [6]:
# Save the cleaned data
df_plot.to_csv(data_folder + '/plot_summaries_cleaned.csv', index=False)