In [1]:
import numpy as np
import pandas as pd
import requests

In [2]:
data_path = "./data/"

# Analysis of characters dataset

In [3]:
#load characters data
chars = pd.read_csv(data_path + 'lotr_characters.csv')
chars.head()

Unnamed: 0,birth,death,gender,hair,height,name,race,realm,spouse
0,,,Female,,,Adanel,Men,,Belemir
1,TA 2978,"February 26 ,3019",Male,Dark (book) Light brown (movie),,Boromir,Men,,
2,,"March ,3019",Male,,,Lagduf,Orcs,,
3,TA 280,TA 515,Male,,,Tarcil,Men,Arnor,Unnamed wife
4,,,Male,,,Fire-drake of Gondolin,Dragon,,


**Data Cleaning**

In [4]:
#map the races
race_mappings = {
    "Dragons" : "Dragon",
    "Dwarven" : "Dwarf",
    "Dwarves" : "Dwarf",
    "Eagles" : "Eagle",
    "Elves" : "Elf",
    "Hobbits" : "Hobbit",
    "Orcs" : "Orc",
    "Balrogs" : "Balrog"
}
chars['race'].replace(race_mappings, regex = True, inplace = True)
chars.head()

Unnamed: 0,birth,death,gender,hair,height,name,race,realm,spouse
0,,,Female,,,Adanel,Men,,Belemir
1,TA 2978,"February 26 ,3019",Male,Dark (book) Light brown (movie),,Boromir,Men,,
2,,"March ,3019",Male,,,Lagduf,Orc,,
3,TA 280,TA 515,Male,,,Tarcil,Men,Arnor,Unnamed wife
4,,,Male,,,Fire-drake of Gondolin,Dragon,,


In [5]:
#get some stats about races with grouping
race_counts = chars.groupby('race').size().reset_index(name='size')
race_counts.sort_values('size', ascending = False).head(5)

Unnamed: 0,race,size
24,Men,388
19,Hobbit,142
8,Elf,106
6,Dwarf,44
0,Ainur,24


In [6]:
#map the gender and get some stats
gender_mappings = {
    "Males" : "Male",
    "male" : "Male",
    "Most likely male" : "Male"
}
chars['gender'].replace(gender_mappings, inplace = True)

gender_counts = chars.groupby('gender').size().reset_index(name='size')
gender_counts.sort_values('size', ascending = False)

Unnamed: 0,gender,size
1,Male,633
0,Female,135


In [7]:
#same for hair color
hair_counts = chars.groupby('hair').size().reset_index(name='size')
hair_counts.sort_values('size', ascending = False).head()

Unnamed: 0,hair,size
26,Dark,29
45,Golden,21
5,Black,12
16,Brown,6
60,,5


In [8]:
#and realm...
realm_counts = chars.groupby('realm').size().reset_index(name='size')
realm_counts.sort_values('size', ascending = False).head()

Unnamed: 0,realm,size
22,Gondor,33
47,Númenor,28
56,Rohan,20
7,Arthedain,9
4,Arnor,7


*Date for birth and death are not categorical. In order to get some insight, we are categorizing them based on the era*

In [9]:
timeline_mappings = {
    "Third Age" : "TA",
    "Second Age" : "SA",
    "First Age" : "FA",
    "Years of the Trees" : "YT",
    "Fourth Age" : "FO"
}
def birth_death(d, cname):
    d[cname].replace(timeline_mappings, regex = True, inplace = True)
    d.loc[d[cname].str.startswith("Late", na=False), cname] = d[cname].str[-2:]
    d.loc[d[cname].str.startswith("Mid", na=False), cname] = d[cname].str[-2:]
    d.loc[d[cname].str.startswith("Early", na=False), cname] = d[cname].str[-2:]
    d.loc[d[cname].str.startswith("Perhaps", na=False), cname] = d[cname].str.replace("Perhaps ,", "")
    d.loc[d[cname].str.contains("Arda", na=False), cname] = "BA"
    era = cname + '_era'
    d[era] = d[cname].str[:2]
    d.loc[d[era] == 'Un', era] = 'Unknown'
    if cname == 'death':
        d.loc[d[era] == 'Im', era] = 'Immortal'
    return d

In [10]:
#grouping data based on birth and death era to get some insight about lifespan
chars = birth_death(chars, 'birth')
chars = birth_death(chars, 'death')
birth_era_counts = chars.groupby(['birth_era', 'death_era']).size().reset_index(name='size')
birth_era_counts.sort_values('size', ascending = False).head()

Unnamed: 0,birth_era,death_era,size
51,TA,TA,194
19,FA,FA,34
35,SA,SA,31
75,ge,FA,26
46,TA,FO,22


**Playing & showing some facts from data**

Battle of the Pelennor Fields is at March 15, 3019. Let's see who has died in this battle.

In [11]:
chars[(chars.death.str.contains('3019', na=False)) & \
      (chars.death.str.contains('March', na=False)) & \
      (chars.death.str.contains('15', na=False))][['name', 'birth', 'death']]

Unnamed: 0,name,birth,death
59,Fastred (Pelennor Fields),ge,"March 15 ,3019"
164,Horn,"Possibly late ,Third Age","March 15 ,TA 3019"
222,Dúnhere,ge,"March 15 ,3019"
226,Déorwine,ge,"March 15 ,3019"
260,Hirluin,ge,"March 15 ,3019"
266,Herubrand,ge,"March 15 ,3019"
269,Herefara,ge,"March 15 ,3019"
277,Harding,ge,"March 15 ,3019"
291,Halbarad,TA 2916,"March 15 ,3019"
314,Denethor II,TA 2930,"March 15 ,3019"


Getting information about some key characters

In [12]:
chars[chars.name == 'Gandalf']

Unnamed: 0,birth,death,gender,hair,height,name,race,realm,spouse,birth_era,death_era
667,BA,"January 253019 ,Battle of the Peak immortal",Male,"Grey, later white",,Gandalf,Maiar,,,BA,Ja


In [13]:
chars[chars.name == 'Legolas']

Unnamed: 0,birth,death,gender,hair,height,name,race,realm,spouse,birth_era,death_era
661,,"Still alive, departed to ,Aman ,FO 120",Male,"Uncertain (book), Blonde (films)",,Legolas,Elf,,,,St


In [14]:
chars[chars.name == 'Gollum']

Unnamed: 0,birth,death,gender,hair,height,name,race,realm,spouse,birth_era,death_era
473,TA 2430,"March 25 ,3019",Male,,,Gollum,Hobbit,,,TA,Ma


In [15]:
chars[chars.name == 'Aragorn II Elessar']

Unnamed: 0,birth,death,gender,hair,height,name,race,realm,spouse,birth_era,death_era
873,"March 1 ,2931",FO 120,Male,Dark,"198cm (6'6"")",Aragorn II Elessar,Men,"Reunited Kingdom,Arnor,Gondor",Arwen,Ma,FO


# Analysis of script dataset

In [94]:
script = pd.read_csv(data_path + "/lotr_scripts.csv")
script.head()

Unnamed: 0.1,Unnamed: 0,char,dialog,movie
0,0,DEAGOL,"Oh Smeagol Ive got one! , Ive got a fish Smeag...",The Return of the King
1,1,SMEAGOL,"Pull it in! Go on, go on, go on, pull it in!",The Return of the King
2,2,DEAGOL,Arrghh!,The Return of the King
3,3,SMEAGOL,Deagol!,The Return of the King
4,4,SMEAGOL,Deagol!,The Return of the King


This data seems cleaner than previous data we analyze. The only can be done to drop unnecessary index column.

In [95]:
script.drop('Unnamed: 0', axis = 1, inplace = True)
script.head()

Unnamed: 0,char,dialog,movie
0,DEAGOL,"Oh Smeagol Ive got one! , Ive got a fish Smeag...",The Return of the King
1,SMEAGOL,"Pull it in! Go on, go on, go on, pull it in!",The Return of the King
2,DEAGOL,Arrghh!,The Return of the King
3,SMEAGOL,Deagol!,The Return of the King
4,SMEAGOL,Deagol!,The Return of the King


First, we groupby data based on character name, and see how many quotes they have in the data.

error: missing ), unterminated subpattern at position 0

In [87]:
#remove additional blanks
script.char = script.char.str.strip()
# there are some dialogs where char is mentioned as character + VOICE OVER(or VOICEOVER)
script.char = script.char.str.replace("VOICE", "")
script.char = script.char.str.replace("OVER", "").str.strip()

char_count = script.groupby('char').size().reset_index(name='size')
char_count.sort_values('size', ascending = False).head(10)

Unnamed: 0,char,size
25,FRODO,229
72,SAM,218
30,GANDALF,215
1,ARAGORN,187
65,PIPPIN,163
54,MERRY,137
36,GOLLUM,134
35,GIMLI,116
87,THEODEN,110
21,FARAMIR,65


After getting the stats about the characters, for second step, we want to get, how many movies are there in data, and
analyze how many quotes each movie has.

In [88]:
script.movie = script.movie.str.strip()
movie_count = script.groupby('movie').size().reset_index(name='size')
movie_count.sort_values('size', ascending = False).head()

Unnamed: 0,movie,size
2,The Two Towers,1010
1,The Return of the King,873
0,The Fellowship of the Ring,507


lets try to find the most talkative characters in all films

In [89]:
movie_names = movie_count['movie'].values

words_count = script
words_count['word_count'] = words_count.dialog.str.split(' ').str.len()
words_count = words_count.groupby(['char','movie']).sum().reset_index()
words_count.sort_values('word_count', ascending = False).head()

Unnamed: 0,char,movie,word_count
49,GANDALF,The Return of the King,1991.0
10,BILBO,The Fellowship of the Ring,1491.0
112,SAM,The Return of the King,1440.0
48,GANDALF,The Fellowship of the Ring,1426.0
113,SAM,The Two Towers,1323.0


now lets try to find most talkative characters in every film

In [90]:
for movie_name in movie_names:
    movie_characters = words_count.loc[words_count['movie'] == movie_name]
    display(movie_characters.sort_values('word_count', ascending = False).head())

Unnamed: 0,char,movie,word_count
10,BILBO,The Fellowship of the Ring,1491.0
48,GANDALF,The Fellowship of the Ring,1426.0
42,GALADRIEL,The Fellowship of the Ring,662.0
38,FRODO,The Fellowship of the Ring,658.0
12,BOROMIR,The Fellowship of the Ring,457.0


Unnamed: 0,char,movie,word_count
49,GANDALF,The Return of the King,1991.0
112,SAM,The Return of the King,1440.0
39,FRODO,The Return of the King,1032.0
102,PIPPIN,The Return of the King,848.0
2,ARAGORN,The Return of the King,818.0


Unnamed: 0,char,movie,word_count
113,SAM,The Two Towers,1323.0
50,GANDALF,The Two Towers,1024.0
60,GOLLUM,The Two Towers,935.0
134,THEODEN,The Two Towers,930.0
3,ARAGORN,The Two Towers,881.0


Now let's discover some stats about a specific movie. Let's get main characters of first movie - The Two Towers with regards to number of dialogues they have.

In [20]:
two_towers = script[script.movie == 'The Two Towers']
two_towers_count = two_towers.groupby('char').size().reset_index(name='size')
two_towers_count.sort_values('size', ascending = False).head()

Unnamed: 0,char,size
0,ARAGORN,99
35,SAM,89
12,FRODO,84
17,GOLLUM,78
41,THEODEN,64


It would also be interesting to know which words main characters used most. So, in order to that, we retrieve all the dialogues the character has, and split each dialogue to words and remove stopwords.

Additionally, we add an option to find that which other characters the character mention or refer most. Basically, for each word we check that this word is inside the character list. 

In [27]:
with open(data_path + '/stopwords.txt', 'r') as f:
    stopwords = f.read().split(' ')

def most_used_words(char_name, movie_name, n_words, only_characters = False):
    char_name = char_name.upper()
    m = script[script.movie == movie_name]
    w = m[m.char == char_name].dialog.str.strip().str.lower().str.replace(r'([^\s\w]|_)+', '').str.split().values
    d = {}
    for quote in w:
        for word in quote:
            if word not in stopwords and (only_characters == False or word in m.char.str.lower().values):
                if word in d:
                    d[word] += 1
                else:
                    d[word] = 1
    cooccurences = {k: v for k, v in sorted(d.items(), key=lambda item: -item[1])}
    return list(cooccurences.items())[:n_words]

Let's play with data to see what one of the main characters of each movie speak most.

In [28]:
print('Gollum in "The Two Towers"')
print("1. Most used words: \n", most_used_words('gollum', 'The Two Towers', 4))
print("2. Most mentioned character names: \n", most_used_words('gollum', 'The Two Towers', 4, only_characters=True))

Gollum in "The Two Towers"
1. Most used words: 
 [('master', 13), ('precious', 9), ('kill', 6), ('gollum', 6)]
2. Most mentioned character names: 
 [('gollum', 6), ('smeagol', 5)]


So we can discover famous words of Gollum such as 'master' and 'precious' in data  :D 

In [29]:
print('Gandalf in "The Return of the King"')
print("1. Most used words: \n", most_used_words('gandalf', 'The Return of the King', 5))
print("2. Most mentioned character names: \n", most_used_words('gandalf', 'The Return of the King', 5, only_characters=True))

Gandalf in "The Return of the King"
1. Most used words: 
 [('city', 10), ('frodo', 8), ('king', 7), ('peregrin', 7), ('gondor', 7)]
2. Most mentioned character names: 
 [('frodo', 8), ('saruman', 5), ('sauron', 5), ('faramir', 5), ('denethor', 4)]


In [30]:
print('Frodo in "The Fellowship of the Ring"')
print("1. Most used words: \n", most_used_words('frodo', 'The Fellowship of the Ring', 3))
print("2. Most mentioned character names: \n", most_used_words('frodo', 'The Fellowship of the Ring', 3, only_characters=True))

Frodo in "The Fellowship of the Ring"
1. Most used words: 
 [('sam', 7), ('bilbo', 6), ('gandalf', 5)]
2. Most mentioned character names: 
 [('sam', 7), ('bilbo', 6), ('gandalf', 5)]


So, it seems Frodo loves Sam, Bilbo, and Gandalf so much.