In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import spacy

In [2]:
metadata = pd.read_csv("data/MovieSummaries/movie.metadata.tsv", sep='\t', header=0,
                       names=['wiki_id', 'freebase_id', 'name', 'release_date','box_office_revenue', 'runtime', 'languages', 'countries', 'genres'])
metadata.head()

Unnamed: 0,wiki_id,freebase_id,name,release_date,box_office_revenue,runtime,languages,countries,genres
0,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
1,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
2,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
3,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}"
4,13696889,/m/03cfc81,The Gangsters,1913-05-29,,35.0,"{""/m/06ppq"": ""Silent film"", ""/m/02h40lc"": ""Eng...","{""/m/09c7w0"": ""United States of America""}","{""/m/02hmvc"": ""Short Film"", ""/m/06ppq"": ""Silen..."


### Create writer and director Dataframe

In [3]:
titles = pd.read_csv('data/title.akas.tsv.gz', compression='gzip', sep='\t', na_values="\\N", usecols=['titleId', 'title'])
titles = titles.rename(columns={'titleId' : 'tconst'})
titles.head()

Unnamed: 0,tconst,title
0,tt0000001,Карменсіта
1,tt0000001,Carmencita
2,tt0000001,Carmencita - spanyol tánc
3,tt0000001,Καρμενσίτα
4,tt0000001,Карменсита


In [4]:
## Inner join titles and metadata to only keep the movies that are in the original set
titles = pd.merge(titles, metadata.rename(columns={'name' : 'title'}).title, on='title', how='inner')
titles.head()

Unnamed: 0,tconst,title
0,tt0000001,Carmencita
1,tt0000001,Carmencita
2,tt0000001,Carmencita
3,tt0021748,Carmencita
4,tt0028162,Carmencita


In [5]:
crew = pd.read_csv('data/title.crew.tsv.gz', compression='gzip', sep='\t', na_values="\\N")
crew.head()

Unnamed: 0,tconst,directors,writers
0,tt0000001,nm0005690,
1,tt0000002,nm0721526,
2,tt0000003,nm0721526,
3,tt0000004,nm0721526,
4,tt0000005,nm0005690,


In [6]:
## Inner join crew and titles on tconst to have the title name with the directors and writers
crew = pd.merge(crew, titles, on='tconst', how='inner')
crew.drop('tconst', axis=1, inplace=True) ## Drop useless column "tconst"
crew = crew.drop_duplicates() ## Drop duplicates i.e. titles that have multiple "tconst" values
crew.head()

Unnamed: 0,directors,writers,title
0,nm0005690,,Carmencita
3,nm0721526,,Pauvre Pierrot
5,nm0721526,,Un bon bock
7,nm0005690,,Blacksmith Scene
10,nm0005690,,Chinese Opium Den


In [7]:
## Free memory
del titles

In [8]:
# Create two data frame for directors and writers, and drop the row without directors and writers.
directors = crew[['directors', 'title']].dropna()
writers = crew[['title', 'writers']].dropna()

In [9]:
# Directors and writers are list, so we transform them into rows
directors['directors'] = directors['directors'].str.split(pat=',')
writers['writers'] = writers['writers'].str.split(pat=',')
directors = directors.explode('directors')
writers = writers.explode('writers')

In [10]:
names = pd.read_csv('data/name.basics.tsv.gz', compression='gzip', sep='\t', na_values="\\N")
names.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899.0,1987.0,"soundtrack,actor,miscellaneous","tt0053137,tt0031983,tt0072308,tt0050419"
1,nm0000002,Lauren Bacall,1924.0,2014.0,"actress,soundtrack","tt0037382,tt0117057,tt0071877,tt0038355"
2,nm0000003,Brigitte Bardot,1934.0,,"actress,soundtrack,music_department","tt0049189,tt0057345,tt0056404,tt0054452"
3,nm0000004,John Belushi,1949.0,1982.0,"actor,soundtrack,writer","tt0077975,tt0072562,tt0080455,tt0078723"
4,nm0000005,Ingmar Bergman,1918.0,2007.0,"writer,director,actor","tt0083922,tt0060827,tt0050976,tt0050986"


In [11]:
directors = pd.merge(directors, names.rename(columns={'nconst' : 'directors'})[['directors', 'primaryName']],
                     on='directors', how='inner')
directors.drop('directors', axis=1, inplace=True)
directors = directors.drop_duplicates()
directors.head()

Unnamed: 0,title,primaryName
0,Carmencita,William K.L. Dickson
1,Blacksmith Scene,William K.L. Dickson
2,Chinese Opium Den,William K.L. Dickson
3,Corbett and Courtney Before the Kinetograph,William K.L. Dickson
4,Fred Ott's Sneeze,William K.L. Dickson


In [12]:
writers = pd.merge(writers, names.rename(columns={'nconst' : 'writers'})[['writers', 'primaryName']],
                   on='writers', how='inner')
writers.drop('writers', axis=1, inplace=True)
writers = writers.drop_duplicates()
writers.head()

Unnamed: 0,title,primaryName
0,Miss Jerry,Alexander Black
1,Awakening of Rip,Washington Irving
2,Rip Van Winkle,Washington Irving
6,Have You Got Any Castles?,Washington Irving
7,The Adventures of Ichabod and Mr. Toad,Washington Irving


In [13]:
## Free memory
del names

In [3]:
plots = pd.read_csv("data/MovieSummaries/plot_summaries.txt", sep="\t", header=0, names=['wiki_id', 'plot_summaries'])
plots.sample(10)

Unnamed: 0,wiki_id,plot_summaries
7777,7326436,Small-time gangster Ugo Piazza gets released ...
14881,14955031,Socialite Molly Lasch is released from prison ...
31453,1412856,Gino and Fiore are Italian racketeers who co...
10184,3214698,"Around the turn of the 20th century, British a..."
28823,12642729,The city girl Kate falls in love with farmer L...
23572,10566880,The title character is an innocent housekeeper...
36215,14436640,"Crystal Shackelford lures two strangers, soli..."
40770,7955522,The plot of the special differs significantly ...
2526,11014002,"Plummer plays Fitz Wynn, a truly talented but ..."
25944,4213599,Duke Frederick has usurped and deposed his ol...


In [4]:
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("merge_entities") # Merge entities like ['David' 'Bowie'] to ['David Bowie']

<function spacy.pipeline.functions.merge_entities(doc: spacy.tokens.doc.Doc)>

In [15]:
def tag_adj(text):
    doc = nlp(text)
    noun_adj_pairs = []
    for token in doc:
        ## Is the word a proper noun / noun / pronoun
        if token.pos_ in ('PROPN', 'NOUN', 'PRON'):
            for child in token.children:
                ## Is the child an adjectival modifier
                if child.dep_ == 'amod':
                    noun_adj_pairs.append((token, child))
    return noun_adj_pairs

In [16]:
tag_adj("There is a red card in the blue envelope and a beautiful girl.")

[(card, red), (envelope, blue), (girl, beautiful)]

In [8]:
tag_adj("She is beautiful.")

[]

In [9]:
def tag_verb(text):
    doc = nlp(text)
    verbs = []
    for possible_verb in doc:
        if possible_verb.pos_ == 'VERB':
            for possible_subject in possible_verb.children:
                if possible_subject.dep_ == 'nsubj':
                    verbs.append((possible_subject, possible_verb))
                    break
    return verbs

In [10]:
tag_verb(plots.loc[0].plot_summaries)

[(nation, consists),
 (district, provide),
 (tributes, fight),
 (who, gave),
 (she, starving),
 (He, warns),
 (who, train),
 (Peeta, reveals),
 (sponsors, provide),
 (she, discovers),
 (Peeta, meant),
 (he, said),
 (Games, begin),
 (Katniss, survives),
 (Peeta, forms),
 (They, find),
 (Rue, draws),
 (Katniss, drops),
 (Rue, cares),
 (she, recovers),
 (alliance, gathered),
 (Katniss, draw),
 (Cato, kills),
 (Katniss, runs),
 (she, hears),
 (She, finds),
 (Rue, trapped),
 (Marvel, throws),
 (she, dodges),
 (it, stab),
 (Katniss, shoots),
 (She, comforts),
 (she, gathers),
 (it, sparks),
 (Games, turning),
 (change, avoid),
 (tributes, win),
 (She, portrays),
 (announcer, proclaims),
 (thing, needs),
 (Peeta, begs),
 (Katniss, promises),
 (he, falls),
 (she, heads),
 (Clove, ambushes),
 (Thresh, kills),
 (He, spares),
 (medicine, works),
 (Foxface, dies),
 (she, stole),
 (Crane, changes),
 (They, kill),
 (they, encounter),
 (Katniss, wounds),
 (Katniss, shoots),
 (Peeta, tells),
 (she, gi

In [11]:
tag_adj(plots.loc[0].plot_summaries)

[(Capitol, wealthy),
 (districts, poorer),
 (rebellion, past),
 (Hunger Games, annual),
 (survivor, sole),
 (Reaping, first),
 (Everdeen, 12-year-old),
 (sister, older),
 (tribute, other),
 (mentor, drunk),
 (Haymitch Abernathy, past),
 (academies, special),
 (Games, televised),
 (supplies, tempting),
 (alliance, uneasy),
 (tree, nearby),
 (nest, poisonous),
 (Rue, dying),
 (lovers, crossed),
 (riots, further),
 (district, same),
 (wound, infected),
 (tribute, other),
 (mobile, Peeta),
 (creatures, like),
 (death, prolonged),
 (Games, 74th),
 (enemies, powerful)]

In [17]:
verb = pd.concat([plots.copy(), plots.plot_summaries.apply(tag_verb)], axis=1)
verb.head()

KeyboardInterrupt: 

In [None]:
adj = pd.concat([plots.copy(), plots.plot_summaries.apply(tag_adj)], axis=1)
adj.head()