In [1]:
import re
import nltk
import pandas as pd
from string import punctuation
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [11]:
pd.set_option('display.max_colwidth',50)
data = pd.read_csv("hp_script.csv")
data.head()

Unnamed: 0,ID_number,scene,character_name,dialogue
0,1,1,Albus Dumbledore,"I should have known that you would be here, Pr..."
1,2,1,Minerva McGonagall,"Good evening, Professor Dumbledore. Are the ru..."
2,3,1,Albus Dumbledore,"I'm afraid so, Professor. The good, and the bad."
3,4,1,Minerva McGonagall,And the boy?
4,5,1,Albus Dumbledore,Hagrid is bringing him.


In [12]:
data= data[['character_name','dialogue']]
data.head()

Unnamed: 0,character_name,dialogue
0,Albus Dumbledore,"I should have known that you would be here, Pr..."
1,Minerva McGonagall,"Good evening, Professor Dumbledore. Are the ru..."
2,Albus Dumbledore,"I'm afraid so, Professor. The good, and the bad."
3,Minerva McGonagall,And the boy?
4,Albus Dumbledore,Hagrid is bringing him.


In [13]:
print(f' Input data has {len(data)} rows, {len,(data.columns)} columns')

 Input data has 793 rows, (<built-in function len>, Index(['character_name', 'dialogue'], dtype='object')) columns


### Text Cleaning

In [16]:
def remove_punctuation(data):
    txt_nopunct = [c for c in data if c not in punctuation]
    return txt_nopunct

In [17]:
data['clean'] = data['dialogue'].apply(lambda x: remove_punctuation(x))
data.head(10)

Unnamed: 0,character_name,dialogue,clean
0,Albus Dumbledore,"I should have known that you would be here, Pr...","[I, , s, h, o, u, l, d, , h, a, v, e, , k, ..."
1,Minerva McGonagall,"Good evening, Professor Dumbledore. Are the ru...","[G, o, o, d, , e, v, e, n, i, n, g, , P, r, ..."
2,Albus Dumbledore,"I'm afraid so, Professor. The good, and the bad.","[I, m, , a, f, r, a, i, d, , s, o, , P, r, ..."
3,Minerva McGonagall,And the boy?,"[A, n, d, , t, h, e, , b, o, y]"
4,Albus Dumbledore,Hagrid is bringing him.,"[H, a, g, r, i, d, , i, s, , b, r, i, n, g, ..."
5,Minerva McGonagall,Do you think it wise to trust Hagrid with some...,"[D, o, , y, o, u, , t, h, i, n, k, , i, t, ..."
6,Albus Dumbledore,"Ah, Professor, I would trust Hagrid with my life.","[A, h, , P, r, o, f, e, s, s, o, r, , I, , ..."
7,Rubeus Hagrid,"Professor Dumbledore, Sir. Professor McGonagall.","[P, r, o, f, e, s, s, o, r, , D, u, m, b, l, ..."
8,Albus Dumbledore,"No problems, I trust, Hagrid?","[N, o, , p, r, o, b, l, e, m, s, , I, , t, ..."
9,Rubeus Hagrid,"No, Sir. Little tyke fell asleep just as we we...","[N, o, , S, i, r, , L, i, t, t, l, e, , t, ..."


In [18]:
def remove_punctuation(data):
    txt_nopunct = "".join([c for c in data if c not in punctuation])
    return txt_nopunct

In [19]:
data['clean'] = data['dialogue'].apply(lambda x: remove_punctuation(x))
data.head()

Unnamed: 0,character_name,dialogue,clean
0,Albus Dumbledore,"I should have known that you would be here, Pr...",I should have known that you would be here Pro...
1,Minerva McGonagall,"Good evening, Professor Dumbledore. Are the ru...",Good evening Professor Dumbledore Are the rumo...
2,Albus Dumbledore,"I'm afraid so, Professor. The good, and the bad.",Im afraid so Professor The good and the bad
3,Minerva McGonagall,And the boy?,And the boy
4,Albus Dumbledore,Hagrid is bringing him.,Hagrid is bringing him


In [20]:
def tokenize(txt):
    tokens = re.split('\W+', txt)
    return tokens

data['tokenize'] = data['clean'].apply(lambda x: tokenize(x.lower()))
data.head()

Unnamed: 0,character_name,dialogue,clean,tokenize
0,Albus Dumbledore,"I should have known that you would be here, Pr...",I should have known that you would be here Pro...,"[i, should, have, known, that, you, would, be,..."
1,Minerva McGonagall,"Good evening, Professor Dumbledore. Are the ru...",Good evening Professor Dumbledore Are the rumo...,"[good, evening, professor, dumbledore, are, th..."
2,Albus Dumbledore,"I'm afraid so, Professor. The good, and the bad.",Im afraid so Professor The good and the bad,"[im, afraid, so, professor, the, good, and, th..."
3,Minerva McGonagall,And the boy?,And the boy,"[and, the, boy]"
4,Albus Dumbledore,Hagrid is bringing him.,Hagrid is bringing him,"[hagrid, is, bringing, him]"


### Remove Stop Words

In [22]:
stopwords = nltk.corpus.stopwords.words('english')

def remove_stopwords(cleaned):
    txt_clean = [word for word in cleaned if word not in stopwords]
    return txt_clean

data['stop words'] = data['tokenize'].apply(lambda x: remove_stopwords(x))
data.tail()

Unnamed: 0,character_name,dialogue,clean,tokenize,stop words
788,Rubeus Hagrid,"Go on. On with you, on with you now. Oh, now l...",Go on On with you on with you now Oh now liste...,"[go, on, on, with, you, on, with, you, now, oh...","[go, oh, listen, harry, dope, cousin, dudley, ..."
789,Harry Potter,"But Hagrid, we're not allowed to do magic away...",But Hagrid were not allowed to do magic away f...,"[but, hagrid, were, not, allowed, to, do, magi...","[hagrid, allowed, magic, away, hogwarts, know]"
790,Rubeus Hagrid,"I do, but your cousin don't, do he? Eh? Hmmhmm",I do but your cousin dont do he Eh Hmmhmm,"[i, do, but, your, cousin, dont, do, he, eh, h...","[cousin, dont, eh, hmmhmm]"
791,Hermione Granger,"Feels strange to be going home, doesn't it?",Feels strange to be going home doesnt it,"[feels, strange, to, be, going, home, doesnt, it]","[feels, strange, going, home, doesnt]"
792,Harry Potter,"I'm not going home, not really.",Im not going home not really,"[im, not, going, home, not, really]","[im, going, home, really]"


### Porter Stemming 

In [23]:
ps = PorterStemmer()

def stemming(tokenized):
    text = [ps.stem(word) for word in tokenized]
    return text

In [26]:
data['stemmed'] = data['stop words'].apply(lambda x: stemming(x))
data.tail(10)

Unnamed: 0,character_name,dialogue,clean,tokenize,stop words,stemmed
783,Rubeus Hagrid,"Come on now, hurry up, you'll be late! Train's...",Come on now hurry up youll be late Trains leav...,"[come, on, now, hurry, up, youll, be, late, tr...","[come, hurry, youll, late, trains, leaving, go...","[come, hurri, youll, late, train, leav, go, co..."
784,Hermione Granger,"Come on, Harry.",Come on Harry,"[come, on, harry]","[come, harry]","[come, harri]"
785,Harry Potter,One minute.,One minute,"[one, minute]","[one, minute]","[one, minut]"
786,Rubeus Hagrid,Thought you were leaving without sayin' goodby...,Thought you were leaving without sayin goodbye...,"[thought, you, were, leaving, without, sayin, ...","[thought, leaving, without, sayin, goodbye, ya, ]","[thought, leav, without, sayin, goodby, ya, ]"
787,Harry Potter,"Thanks, Hagrid.",Thanks Hagrid,"[thanks, hagrid]","[thanks, hagrid]","[thank, hagrid]"
788,Rubeus Hagrid,"Go on. On with you, on with you now. Oh, now l...",Go on On with you on with you now Oh now liste...,"[go, on, on, with, you, on, with, you, now, oh...","[go, oh, listen, harry, dope, cousin, dudley, ...","[go, oh, listen, harri, dope, cousin, dudley, ..."
789,Harry Potter,"But Hagrid, we're not allowed to do magic away...",But Hagrid were not allowed to do magic away f...,"[but, hagrid, were, not, allowed, to, do, magi...","[hagrid, allowed, magic, away, hogwarts, know]","[hagrid, allow, magic, away, hogwart, know]"
790,Rubeus Hagrid,"I do, but your cousin don't, do he? Eh? Hmmhmm",I do but your cousin dont do he Eh Hmmhmm,"[i, do, but, your, cousin, dont, do, he, eh, h...","[cousin, dont, eh, hmmhmm]","[cousin, dont, eh, hmmhmm]"
791,Hermione Granger,"Feels strange to be going home, doesn't it?",Feels strange to be going home doesnt it,"[feels, strange, to, be, going, home, doesnt, it]","[feels, strange, going, home, doesnt]","[feel, strang, go, home, doesnt]"
792,Harry Potter,"I'm not going home, not really.",Im not going home not really,"[im, not, going, home, not, really]","[im, going, home, really]","[im, go, home, realli]"


In [37]:
words = data['stop words'][1:7]
words

1    [good, evening, professor, dumbledore, rumours...
2                   [im, afraid, professor, good, bad]
3                                                [boy]
4                                   [hagrid, bringing]
5    [think, wise, trust, hagrid, something, import...
6          [ah, professor, would, trust, hagrid, life]
Name: stop words, dtype: object

### Stemming and Lemmatization

In [40]:
lemmatizer = WordNetLemmatizer()

for y in words:
    for w in y: 
        print("\n\n"+w, "\nSTEMMING :", ps.stem(w),"\n","LEMMATIZATION :", lemmatizer.lemmatize(w,pos = "n"))



good 
STEMMING : good 
 LEMMATIZATION : good


evening 
STEMMING : even 
 LEMMATIZATION : evening


professor 
STEMMING : professor 
 LEMMATIZATION : professor


dumbledore 
STEMMING : dumbledor 
 LEMMATIZATION : dumbledore


rumours 
STEMMING : rumour 
 LEMMATIZATION : rumour


true 
STEMMING : true 
 LEMMATIZATION : true


albus 
STEMMING : albu 
 LEMMATIZATION : albus


im 
STEMMING : im 
 LEMMATIZATION : im


afraid 
STEMMING : afraid 
 LEMMATIZATION : afraid


professor 
STEMMING : professor 
 LEMMATIZATION : professor


good 
STEMMING : good 
 LEMMATIZATION : good


bad 
STEMMING : bad 
 LEMMATIZATION : bad


boy 
STEMMING : boy 
 LEMMATIZATION : boy


hagrid 
STEMMING : hagrid 
 LEMMATIZATION : hagrid


bringing 
STEMMING : bring 
 LEMMATIZATION : bringing


think 
STEMMING : think 
 LEMMATIZATION : think


wise 
STEMMING : wise 
 LEMMATIZATION : wise


trust 
STEMMING : trust 
 LEMMATIZATION : trust


hagrid 
STEMMING : hagrid 
 LEMMATIZATION : hagrid


something 
STEMMING : 

In [43]:
#without pos
for y in words:
    for w in y: 
         print("\n\n"+w, "\nSTEMMING :", ps.stem(w),"\n","LEMMATIZATION :", lemmatizer.lemmatize(w))



good 
STEMMING : good 
 LEMMATIZATION : good


evening 
STEMMING : even 
 LEMMATIZATION : evening


professor 
STEMMING : professor 
 LEMMATIZATION : professor


dumbledore 
STEMMING : dumbledor 
 LEMMATIZATION : dumbledore


rumours 
STEMMING : rumour 
 LEMMATIZATION : rumour


true 
STEMMING : true 
 LEMMATIZATION : true


albus 
STEMMING : albu 
 LEMMATIZATION : albus


im 
STEMMING : im 
 LEMMATIZATION : im


afraid 
STEMMING : afraid 
 LEMMATIZATION : afraid


professor 
STEMMING : professor 
 LEMMATIZATION : professor


good 
STEMMING : good 
 LEMMATIZATION : good


bad 
STEMMING : bad 
 LEMMATIZATION : bad


boy 
STEMMING : boy 
 LEMMATIZATION : boy


hagrid 
STEMMING : hagrid 
 LEMMATIZATION : hagrid


bringing 
STEMMING : bring 
 LEMMATIZATION : bringing


think 
STEMMING : think 
 LEMMATIZATION : think


wise 
STEMMING : wise 
 LEMMATIZATION : wise


trust 
STEMMING : trust 
 LEMMATIZATION : trust


hagrid 
STEMMING : hagrid 
 LEMMATIZATION : hagrid


something 
STEMMING : 