## Importing libraries

In [48]:
import pandas as pd
import numpy as np
import seaborn as sns
import nltk
resources = ['stopwords','wordnet','punkt','averaged_perceptron_tagger']
nltk.download(resources)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
## Spacy pre processing
import spacy
import regex as re
import string

## Loading data

In [6]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/movies.csv')
df.head()

Unnamed: 0,rank,title,genre,wiki_plot,imdb_plot
0,0,The Godfather,"[u' Crime', u' Drama']","On the day of his only daughter's wedding, Vit...","In late summer 1945, guests are gathered for t..."
1,1,The Shawshank Redemption,"[u' Crime', u' Drama']","In 1947, banker Andy Dufresne is convicted of ...","In 1947, Andy Dufresne (Tim Robbins), a banker..."
2,2,Schindler's List,"[u' Biography', u' Drama', u' History']","In 1939, the Germans move Polish Jews into the...",The relocation of Polish Jews from surrounding...
3,3,Raging Bull,"[u' Biography', u' Drama', u' Sport']","In a brief scene in 1964, an aging, overweight...","The film opens in 1964, where an older and fat..."
4,4,Casablanca,"[u' Drama', u' Romance', u' War']",It is early December 1941. American expatriate...,"In the early years of World War II, December 1..."


In [8]:
df.shape

(100, 5)

## Data Cleaning and Pre-processing

### Using Spacy

In [14]:
from spacy.lang.en.stop_words import STOP_WORDS

spacy_nlp = spacy.load('en_core_web_sm')

#create list of punctuations and stopwords
punctuations = string.punctuation
stop_words = spacy.lang.en.stop_words.STOP_WORDS

#function for data cleaning and processing
#This can be further enhanced by adding / removing reg-exps as desired.

def spacy_tokenizer(sentence):

    #remove distracting single quotes
    sentence = re.sub('\'','',sentence)

    #remove digits adnd words containing digits
    sentence = re.sub('\w*\d\w*','',sentence)

    #replace extra spaces with single space
    sentence = re.sub(' +',' ',sentence)

    #remove unwanted lines starting from special charcters
    sentence = re.sub(r'\n: \'\'.*','',sentence)
    sentence = re.sub(r'\n!.*','',sentence)
    sentence = re.sub(r'^:\'\'.*','',sentence)

    #remove non-breaking new line characters
    sentence = re.sub(r'\n',' ',sentence)

    #remove punctunations
    sentence = re.sub(r'[^\w\s]',' ',sentence)

    #creating token object
    tokens = spacy_nlp(sentence)

    #lower, strip and lemmatize
    tokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in tokens]

    #remove stopwords, and exclude words less than 2 characters
    tokens = [word for word in tokens if word not in stop_words and word not in punctuations and len(word) > 2]

    #return tokens
    return tokens

In [15]:
df.head()

Unnamed: 0,rank,title,genre,wiki_plot,imdb_plot
0,0,The Godfather,"[u' Crime', u' Drama']","On the day of his only daughter's wedding, Vit...","In late summer 1945, guests are gathered for t..."
1,1,The Shawshank Redemption,"[u' Crime', u' Drama']","In 1947, banker Andy Dufresne is convicted of ...","In 1947, Andy Dufresne (Tim Robbins), a banker..."
2,2,Schindler's List,"[u' Biography', u' Drama', u' History']","In 1939, the Germans move Polish Jews into the...",The relocation of Polish Jews from surrounding...
3,3,Raging Bull,"[u' Biography', u' Drama', u' Sport']","In a brief scene in 1964, an aging, overweight...","The film opens in 1964, where an older and fat..."
4,4,Casablanca,"[u' Drama', u' Romance', u' War']",It is early December 1941. American expatriate...,"In the early years of World War II, December 1..."


In [18]:
df['wiki_plot_tokenized'] = df['wiki_plot'].map(lambda x: spacy_tokenizer(x))


In [19]:
df.head()

Unnamed: 0,rank,title,genre,wiki_plot,imdb_plot,wiki_plot_tokenized
0,0,The Godfather,"[u' Crime', u' Drama']","On the day of his only daughter's wedding, Vit...","In late summer 1945, guests are gathered for t...","[day, daughter, wedding, vito, corleone, hear,..."
1,1,The Shawshank Redemption,"[u' Crime', u' Drama']","In 1947, banker Andy Dufresne is convicted of ...","In 1947, Andy Dufresne (Tim Robbins), a banker...","[banker, andy, dufresne, convict, murder, wife..."
2,2,Schindler's List,"[u' Biography', u' Drama', u' History']","In 1939, the Germans move Polish Jews into the...",The relocation of Polish Jews from surrounding...,"[germans, polish, jews, kraków, ghetto, world,..."
3,3,Raging Bull,"[u' Biography', u' Drama', u' Sport']","In a brief scene in 1964, an aging, overweight...","The film opens in 1964, where an older and fat...","[brief, scene, age, overweight, italian, ameri..."
4,4,Casablanca,"[u' Drama', u' Romance', u' War']",It is early December 1941. American expatriate...,"In the early years of World War II, December 1...","[early, december, american, expatriate, rick, ..."


In [21]:
df['wiki_token'] = df['wiki_plot'].map(lambda x: spacy_tokenizer(x))

In [22]:
df.head()

Unnamed: 0,rank,title,genre,wiki_plot,imdb_plot,wiki_plot_tokenized,wiki_token
0,0,The Godfather,"[u' Crime', u' Drama']","On the day of his only daughter's wedding, Vit...","In late summer 1945, guests are gathered for t...","[day, daughter, wedding, vito, corleone, hear,...","[day, daughter, wedding, vito, corleone, hear,..."
1,1,The Shawshank Redemption,"[u' Crime', u' Drama']","In 1947, banker Andy Dufresne is convicted of ...","In 1947, Andy Dufresne (Tim Robbins), a banker...","[banker, andy, dufresne, convict, murder, wife...","[banker, andy, dufresne, convict, murder, wife..."
2,2,Schindler's List,"[u' Biography', u' Drama', u' History']","In 1939, the Germans move Polish Jews into the...",The relocation of Polish Jews from surrounding...,"[germans, polish, jews, kraków, ghetto, world,...","[germans, polish, jews, kraków, ghetto, world,..."
3,3,Raging Bull,"[u' Biography', u' Drama', u' Sport']","In a brief scene in 1964, an aging, overweight...","The film opens in 1964, where an older and fat...","[brief, scene, age, overweight, italian, ameri...","[brief, scene, age, overweight, italian, ameri..."
4,4,Casablanca,"[u' Drama', u' Romance', u' War']",It is early December 1941. American expatriate...,"In the early years of World War II, December 1...","[early, december, american, expatriate, rick, ...","[early, december, american, expatriate, rick, ..."


In [24]:
data = df.wiki_token
data = pd.DataFrame(data)

In [25]:
data.head()

Unnamed: 0,wiki_token
0,"[day, daughter, wedding, vito, corleone, hear,..."
1,"[banker, andy, dufresne, convict, murder, wife..."
2,"[germans, polish, jews, kraków, ghetto, world,..."
3,"[brief, scene, age, overweight, italian, ameri..."
4,"[early, december, american, expatriate, rick, ..."


### Using NLTK


In [26]:
data['wiki_plot'] = df.wiki_plot

In [27]:
data.head()

Unnamed: 0,wiki_token,wiki_plot
0,"[day, daughter, wedding, vito, corleone, hear,...","On the day of his only daughter's wedding, Vit..."
1,"[banker, andy, dufresne, convict, murder, wife...","In 1947, banker Andy Dufresne is convicted of ..."
2,"[germans, polish, jews, kraków, ghetto, world,...","In 1939, the Germans move Polish Jews into the..."
3,"[brief, scene, age, overweight, italian, ameri...","In a brief scene in 1964, an aging, overweight..."
4,"[early, december, american, expatriate, rick, ...",It is early December 1941. American expatriate...


In [32]:
def nltk_lemmatize(sentence):
    sentence = sentence.lower()


0

In [46]:
sentence = data.wiki_plot[1]
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = stopwords.words('english')



from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from nltk.corpus import wordnet as wn

lemmatizer = WordNetLemmatizer()

tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J']: wn.ADJ
tag_map['V']: wn.VERB
tag_map['R']: wn.ADV



In [54]:
def lemmatize_sent(sentence):
    sentence = sentence.lower()
    clean_sentence = re.sub('[^a-zA-Z\s]','', sentence)
    words = clean_sentence.split()
    fil_words = [ word for word in words if word not in stop_words]
    new_sentence = ' '.join(fil_words)
    tokens = word_tokenize(new_sentence)
    tagged_tokens = pos_tag(tokens)
    final_words = []
    for i in range(len(tokens)):
        token, tag = tagged_tokens[i]
        word_final = lemmatizer.lemmatize(token, tag_map[tag[0]])
        final_words.append(word_final)

    return final_words






In [56]:
sentence

'in 1947, banker andy dufresne is convicted of murdering his wife and her lover and sentenced to two consecutive life sentences at the fictional shawshank state penitentiary in the state of maine. andy befriends contraband smuggler ellis "red" redding, an inmate serving a life sentence. red procures a rock hammer and later a large poster of rita hayworth for andy. working in the prison laundry, andy is regularly assaulted by the "bull queer" gang "the sisters" and their leader, bogs.  in 1947, banker andy dufresne is convicted of murdering his wife and her lover and sentenced to two consecutive life sentences at the fictional shawshank state penitentiary in the state of maine. andy befriends   contraband  contraband   smuggler ellis "red" redding, an inmate serving a life sentence. red procures a   rock hammer  rock hammer   and later a large poster of   rita hayworth  rita hayworth   for andy. working in the prison laundry, andy is regularly assaulted by the "  bull queer  bull queer 

In [55]:
lemmatize_sent(sentence)

['banker',
 'andy',
 'dufresne',
 'convicted',
 'murdering',
 'wife',
 'lover',
 'sentenced',
 'two',
 'consecutive',
 'life',
 'sentence',
 'fictional',
 'shawshank',
 'state',
 'penitentiary',
 'state',
 'maine',
 'andy',
 'befriends',
 'contraband',
 'smuggler',
 'elli',
 'red',
 'redding',
 'inmate',
 'serving',
 'life',
 'sentence',
 'red',
 'procures',
 'rock',
 'hammer',
 'later',
 'large',
 'poster',
 'rita',
 'hayworth',
 'andy',
 'working',
 'prison',
 'laundry',
 'andy',
 'regularly',
 'assaulted',
 'bull',
 'queer',
 'gang',
 'sister',
 'leader',
 'bog',
 'banker',
 'andy',
 'dufresne',
 'convicted',
 'murdering',
 'wife',
 'lover',
 'sentenced',
 'two',
 'consecutive',
 'life',
 'sentence',
 'fictional',
 'shawshank',
 'state',
 'penitentiary',
 'state',
 'maine',
 'andy',
 'befriends',
 'contraband',
 'contraband',
 'smuggler',
 'elli',
 'red',
 'redding',
 'inmate',
 'serving',
 'life',
 'sentence',
 'red',
 'procures',
 'rock',
 'hammer',
 'rock',
 'hammer',
 'later',
 

SyntaxError: ignored

In [42]:
clean_sentence

'in  banker andy dufresne is convicted of murdering his wife and her lover and sentenced to two consecutive life sentences at the fictional shawshank state penitentiary in the state of maine andy befriends contraband smuggler ellis red redding an inmate serving a life sentence red procures a rock hammer and later a large poster of rita hayworth for andy working in the prison laundry andy is regularly assaulted by the bull queer gang the sisters and their leader bogs  in  banker andy dufresne is convicted of murdering his wife and her lover and sentenced to two consecutive life sentences at the fictional shawshank state penitentiary in the state of maine andy befriends   contraband  contraband   smuggler ellis red redding an inmate serving a life sentence red procures a   rock hammer  rock hammer   and later a large poster of   rita hayworth  rita hayworth   for andy working in the prison laundry andy is regularly assaulted by the   bull queer  bull queer   gang the sisters and their le