# Imports and folder paths

In [106]:
import json
import os

# NOTE: the plotting API lives in matplotlib.pyplot; binding the top-level
# `matplotlib` package to `plt` would make calls such as plt.subplots() fail.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Folder that holds the dataset files; each loading cell joins its file name
# onto this path with os.path.join.
PATH_FOLDER = "MovieSummaries/"

# TRAIN DATA

## 1) Character Metadata

Metadata for 450,669 characters aligned to the movies above, extracted from the November 4, 2012 dump of Freebase.  

Tab-separated; columns:
1. Wikipedia movie ID 
2. Freebase movie ID
3. Movie release date
4. Character name
5. Actor date of birth
6. Actor gender
7. Actor height (in meters)
8. Actor ethnicity (Freebase ID)
9. Actor name
10. Actor age at movie release
11. Freebase character/actor map ID
12. Freebase character ID
13. Freebase actor ID

---



In [107]:
# Load the tab-separated character metadata into a DataFrame.
character_file_name = os.path.join(PATH_FOLDER, 'character.metadata.csv')

# Column labels, in file order. This is a flat list on purpose: assigning a
# nested list ([[...]]) to DataFrame.columns builds a one-level MultiIndex,
# which makes plain `df["col"]` selection and merges awkward.
character_columns = [
    "Wikipedia movie ID",
    "Freebase movie ID",
    "Movie release date",
    "Character name",
    "Actor date of birth",
    "Actor gender",
    "Actor height (in meters)",
    "Actor ethnicity (Freebase ID)",
    "Actor name",
    "Actor age at movie release",
    "Freebase character/actor map ID",
    "Freebase character ID",
    "Freebase actor ID",
]

# header=None keeps the first line of the file as a data record. With the
# default header inference that record is consumed as column labels and then
# lost when the labels are overwritten.
# NOTE(review): assumes the file ships without a header row — confirm for the
# local copy of the dataset.
character_metadata = pd.read_csv(
    character_file_name,
    sep="\t",
    header=None,
    names=character_columns,
)

character_metadata.sample(5)

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie release date,Character name,Actor date of birth,Actor gender,Actor height (in meters),Actor ethnicity (Freebase ID),Actor name,Actor age at movie release,Freebase character/actor map ID,Freebase character ID,Freebase actor ID
442740,23856720,/m/06_ttwc,2009,,,,,,Yusuf Aulia,,/m/0gc61gz,,/m/0gbygr6
191574,20934648,/m/05b5g_j,2008,,1986-11-09,F,,,Paula Kalenberg,21.0,/m/04p6gv1,,/m/04p6gv4
170472,581185,/m/02s56l,2001-12-14,,,F,,,Sushma Seth,,/m/03jpljy,,/m/02q98x2
97295,29304785,/m/0dryrq_,2011-09-16,,1971-05-05,F,1.7,,Saima,40.0,/m/0gvyf5l,,/m/084xvk
352347,503573,/m/02j69w,1993-09-10,Benny O'Donnell,1975-03-22,M,1.854,/m/041rx,Cole Hauser,18.0,/m/0jx12z,/m/051sncv,/m/0786bc


## 2) Movie Metadata

Metadata for 81,741 movies, extracted from the November 4, 2012 dump of Freebase. 

Tab-separated; columns:
1. Wikipedia movie ID
2. Freebase movie ID
3. Movie name
4. Movie release date
5. Movie box office revenue
6. Movie runtime
7. Movie languages (Freebase ID:name tuples)
8. Movie countries (Freebase ID:name tuples)
9. Movie genres (Freebase ID:name tuples)





In [108]:
# Load the tab-separated movie metadata into a DataFrame.
movie_file_name = os.path.join(PATH_FOLDER, 'movie.metadata.csv')

# Column labels, in file order. A flat list is used because a nested list
# ([[...]]) assigned to DataFrame.columns creates a one-level MultiIndex.
movie_columns = [
    "Wikipedia movie ID",
    "Freebase movie ID",
    "Movie name",
    "Movie release date",
    "Movie box office revenue",
    "Movie runtime",
    "Movie languages (Freebase ID:name tuples)",
    "Movie countries (Freebase ID:name tuples)",
    "Movie genres (Freebase ID:name tuples)",
]

# header=None keeps the first line of the file as a data record instead of
# letting pandas consume it as column labels (it would then be lost once the
# labels are replaced).
# NOTE(review): assumes the file ships without a header row — confirm for the
# local copy of the dataset.
movie_metadata = pd.read_csv(
    movie_file_name,
    sep="\t",
    header=None,
    names=movie_columns,
)

movie_metadata.sample(5)

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie name,Movie release date,Movie box office revenue,Movie runtime,Movie languages (Freebase ID:name tuples),Movie countries (Freebase ID:name tuples),Movie genres (Freebase ID:name tuples)
1491,7641358,/m/0kvd0x,Man in the Dark,1953-04-09,1450000.0,70.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/02xh1"": ""Fil..."
34160,34813793,/m/0j3cl0w,Pension Schöller,1960-07-15,,93.0,"{""/m/04306rv"": ""German Language""}","{""/m/082fr"": ""West Germany""}","{""/m/05p553"": ""Comedy film""}"
62315,31323888,/m/0gjddsy,Der Verlorene Ball,1959,,,"{""/m/04306rv"": ""German Language""}","{""/m/03f2w"": ""German Democratic Republic""}","{""/m/02hmvc"": ""Short Film""}"
12194,6131298,/m/0frt72,The Farmer's Daughter,1973,,58.0,"{""/m/02h40lc"": ""English Language""}",{},"{""/m/01jk9n"": ""Pornographic movie""}"
34583,35281429,/m/0j7h_lw,Kavalukku Kettikaran,1990-01-14,,140.0,"{""/m/07c9s"": ""Tamil Language""}","{""/m/03rk0"": ""India""}",{}


## 3) Plot Summaries

Plot summaries of 42,306 movies extracted from the November 2, 2012 dump of English-language Wikipedia. Each line contains the Wikipedia movie ID (which indexes into movie.metadata.tsv) followed by the summary.

In [124]:
# Load the plot summaries: one "<movie ID>\t<summary>" record per line.
summary_file_name = os.path.join(PATH_FOLDER, 'plot_summaries.txt')

# Flat list of labels; a nested list ([[...]]) would turn the columns into a
# one-level MultiIndex and make `summary_metadata["Movie Summary"]` return a
# DataFrame rather than a Series.
summary_columns = ["Movie ID", "Movie Summary"]

# header=None keeps the first summary as data; with default header inference
# it is consumed as column labels and dropped once the labels are replaced.
# The summary text is stored with the pandas "string" dtype directly at read
# time.
# NOTE(review): assumes the file has no header row — confirm for the local copy.
summary_metadata = pd.read_csv(
    summary_file_name,
    sep="\t",
    header=None,
    names=summary_columns,
    dtype={"Movie Summary": "string"},
)

summary_metadata.sample(5)

Unnamed: 0,Movie ID,Movie Summary
24521,28920951,"Gunnery Sergeant Jim Moore, a Drill Instructor..."
14947,2951463,"A curse hovers over the Preston family, caused..."
37396,25292924,Melinda Uy is a billionaire widow whose life i...
34837,25496280,"Sekhar , Hari ([[Rajendra Prasad , Vasu and t..."
39900,20479465,Inspired by the incredible true story of Lucky...


## 4) Stanford CoreNLP Plot Summaries

The plot summaries, run through the Stanford CoreNLP pipeline (tagging, parsing, NER and coref). Each filename begins with the Wikipedia movie ID (which indexes into movie.metadata.tsv).

In [110]:
# to be added

# TEST DATA

## 1) Tvtropes clusters

72 character types drawn from tvtropes.com, along with 501 instances of those types. The ID field indexes into the Freebase character/actor map ID in character.metadata.tsv.


In [111]:
# Load the tvtropes clusters: each line is "<character type>\t<JSON object>".
tvtropes_file_name = os.path.join(PATH_FOLDER, 'tvtropes.clusters.txt')

# header=None keeps the first line as a data record; with default header
# inference it is consumed as column labels and lost once they are replaced.
# NOTE(review): assumes the file has no header row — confirm for the local copy.
tvtropes_raw = pd.read_csv(
    tvtropes_file_name,
    sep="\t",
    header=None,
    names=['character', 'data'],
)

# Parse every JSON payload once and expand the resulting dicts into columns
# (one column per JSON key, in the order the keys appear).
tvtropes_details = pd.DataFrame(tvtropes_raw['data'].map(json.loads).tolist())

# Character type first, followed by the expanded JSON fields. Both frames share
# the default RangeIndex, so the column-wise concat aligns row by row.
tvtropes_metadata = pd.concat([tvtropes_raw[['character']], tvtropes_details], axis=1)

# Positional relabelling (flat list, not a nested one, to avoid creating a
# one-level MultiIndex).
# NOTE(review): relies on the JSON key order matching these labels — confirm
# against the data file.
tvtropes_metadata.columns = [
    'Character role',
    'Character name',
    'Movie name',
    'Freebase character/actor map ID',
    'Actor name',
]

tvtropes_metadata.sample(5)


Unnamed: 0,Character role,Character name,Movie name,Freebase character/actor map ID,Actor name
440,stoner,Saul Silver,Pineapple Express,/m/02vb3yc,James Franco
29,bounty_hunter,Leonard Smalls,Raising Arizona,/m/02vbtgt,"Randall ""Tex"" Cobb"
8,arrogant_kungfu_guy,Han,Enter the Dragon,/m/02vd8hn,Shih Kien
283,grumpy_old_man,Max Goldman,Grumpy Old Men,/m/02vcwz0,Walter Matthau
90,chanteuse,Dorothy Vallens,Blue Velvet,/m/0jvlc4,Isabella Rossellini


## 2) Name clusters

970 unique character names used in at least two different movies, along with 2,666 instances of those types. The ID field indexes into the Freebase character/actor map ID in character.metadata.tsv.


In [112]:
# Load the name clusters: one "<character name>\t<map ID>" record per line.
name_file_name = os.path.join(PATH_FOLDER, 'name.clusters.txt')

# Flat list of labels; a nested list ([[...]]) assigned to DataFrame.columns
# would create a one-level MultiIndex.
name_columns = ["character name", "Freebase character/actor map ID"]

# header=None keeps the first line as a data record; with default header
# inference it is consumed as column labels and lost once they are replaced.
# NOTE(review): assumes the file has no header row — confirm for the local copy.
name_metadata = pd.read_csv(
    name_file_name,
    sep="\t",
    header=None,
    names=name_columns,
)

name_metadata.sample(5)


Unnamed: 0,character name,Freebase character/actor map ID
1115,Velma Dinkley,/m/0k2zvw
554,Edmund Pevensie,/m/0k4q8z
1293,Daffy Duck,/m/0lmnyj7
1823,Prince Humperdinck,/m/0k65z0
587,Tom Brookman,/m/05gsc55


# DATA ANALYSIS & PROCESSING

In [125]:
# to be added