# Imports and folder paths

In [2]:
# Standard library
import json
import os

# Third-party
import numpy as np
import pandas as pd
# Bind the conventional `plt` alias to the pyplot interface. The bare
# `matplotlib` package does not expose plotting functions such as
# `plt.subplots()` or `plt.plot()`, so aliasing the package itself as `plt`
# breaks any cell that plots.
import matplotlib.pyplot as plt

# Folder containing the dataset files loaded below (relative path).
PATH_FOLDER = "MovieSummaries/"

# TRAIN DATA

## 1) Character Metadata

Metadata for 450,669 characters aligned to the movies above, extracted from the November 4, 2012 dump of Freebase.  

Tab-separated; columns:
1. Wikipedia movie ID 
2. Freebase movie ID
3. Movie release date
4. Character name
5. Actor date of birth
6. Actor gender
7. Actor height (in meters)
8. Actor ethnicity (Freebase ID)
9. Actor name
10. Actor age at movie release
11. Freebase character/actor map ID
12. Freebase character ID
13. Freebase actor ID

---



In [5]:
# Column labels applied to character.metadata.tsv, in file order.
names_1 = [
    "Wikipedia movie ID",
    "Freebase movie ID",
    "Movie release date",
    "Character name",
    "Actor date of birth",
    "Actor gender",
    "Actor height (in meters)",
    "Actor ethnicity (Freebase ID)",
    "Actor name",
    "Actor age at movie release",
    "Freebase character/actor map ID",
    "Freebase character ID",
    "Freebase actor ID",
]

character_file_name = os.path.join(PATH_FOLDER, 'character.metadata.tsv')

# The file is read as tab-separated with no header row; the labels above are
# supplied instead.
character_metadata = pd.read_csv(
    character_file_name,
    sep="\t",
    header=None,
    names=names_1,
)

# Display a few random rows as a quick check of the parsed columns.
character_metadata.sample(5)

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie release date,Character name,Actor date of birth,Actor gender,Actor height (in meters),Actor ethnicity (Freebase ID),Actor name,Actor age at movie release,Freebase character/actor map ID,Freebase character ID,Freebase actor ID
242321,36057866,/m/0j_7hft,1985-07-19,,1948-11-30,F,,/m/0b0gzf,K. R. Vijaya,36.0,/m/0mzt3x7,,/m/04f5d4c
172900,19605841,/m/04n3z42,1975-08-20,,1942-04-30,M,,,Per Pallesen,33.0,/m/09jxs1r,,/m/04n21c1
337087,15094763,/m/03hh6rl,2004-02-19,,,,,,Julia Khanverdieva,,/m/0gccrp1,,/m/0gccrp4
418904,10665065,/m/02qlnzb,1974,,1944-05-29,M,1.82,,Helmut Berger,29.0,/m/02tbcv7,,/m/07pnlb
429510,25413191,/m/09k4j0j,1977-08,,1913-03-15,M,,,Macdonald Carey,,/m/0gbmm10,,/m/0320h5


## 2) Movie Metadata

Metadata for 81,741 movies, extracted from the November 4, 2012 dump of Freebase. 

Tab-separated; columns:
1. Wikipedia movie ID
2. Freebase movie ID
3. Movie name
4. Movie release date
5. Movie box office revenue
6. Movie runtime
7. Movie languages (Freebase ID:name tuples)
8. Movie countries (Freebase ID:name tuples)
9. Movie genres (Freebase ID:name tuples)





In [7]:
# Column labels applied to movie.metadata.tsv, in file order.
names_2 = [
    "Wikipedia movie ID",
    "Freebase movie ID",
    "Movie name",
    "Movie release date",
    "Movie box office revenue",
    "Movie runtime",
    "Movie languages (Freebase ID:name tuples)",
    "Movie countries (Freebase ID:name tuples)",
    "Movie genres (Freebase ID:name tuples)",
]

movie_file_name = os.path.join(PATH_FOLDER, 'movie.metadata.tsv')

# The file is read as tab-separated with no header row; the labels above are
# supplied instead.
movie_metadata = pd.read_csv(
    movie_file_name,
    sep="\t",
    header=None,
    names=names_2,
)

# Display a few random rows as a quick check of the parsed columns.
movie_metadata.sample(5)

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie name,Movie release date,Movie box office revenue,Movie runtime,Movie languages (Freebase ID:name tuples),Movie countries (Freebase ID:name tuples),Movie genres (Freebase ID:name tuples)
14632,23738222,/m/06zt4n3,Tokyo Zombie,2005,,103.0,"{""/m/03_9r"": ""Japanese Language""}","{""/m/03_3d"": ""Japan""}","{""/m/0jb4p32"": ""Zombie Film"", ""/m/0gw5n2f"": ""J..."
22844,25333960,/m/04lqvkt,Frontier of the Dawn,2008,,106.0,"{""/m/064_8sq"": ""French Language""}","{""/m/0f8l9c"": ""France""}","{""/m/068d7h"": ""Romantic drama"", ""/m/02l7c8"": ""..."
9284,13566365,/m/03c9kst,The House Without a Christmas Tree,1972,,75.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/0bwgnb"": ""Christmas movie""}"
37573,20719659,/m/051_d1n,Kiddie League,,,,{},"{""/m/09c7w0"": ""United States of America""}","{""/m/02hmvc"": ""Short Film"", ""/m/0hqxf"": ""Famil..."
9671,31549687,/m/0glp58m,Jolly Boy,2011-06-10,,,{},{},"{""/m/02l7c8"": ""Romance Film"", ""/m/07s9rl0"": ""D..."


## 3) Plot Summaries

Plot summaries of 42,306 movies extracted from the November 2, 2012 dump of English-language Wikipedia. Each line contains the Wikipedia movie ID (which indexes into movie.metadata.tsv) followed by the summary.

In [20]:
summary_file_name = os.path.join(PATH_FOLDER, 'plot_summaries.txt')

# Read the tab-separated (movie ID, summary) pairs and store the summary text
# with pandas' dedicated "string" dtype.
summary_metadata = pd.read_csv(
    summary_file_name,
    sep="\t",
    header=None,
    names=["Movie ID", "Movie Summary"],
).astype({"Movie Summary": "string"})

# Display a few random rows as a quick check of the parsed columns.
summary_metadata.sample(5)

Unnamed: 0,Movie ID,Movie Summary
11301,35866311,The film includes interviews with several reco...
40238,12789158,"Bubba Mabry , a notoriously gullible private d..."
13669,3778613,{{copy edit}} The prequel is set in Mexico in ...
32022,31700012,"Mavis Gary is a divorced, 37-year-old ghost w..."
17353,4517176,One year since Eiji's climactic but unfinished...


## 4) Stanford CoreNLP Plot Summaries

The plot summaries, run through the Stanford CoreNLP pipeline (tagging, parsing, NER and coref). Each filename begins with the Wikipedia movie ID (which indexes into movie.metadata.tsv).

In [110]:
# to be added

# TEST DATA

## 1) Tvtropes clusters

72 character types drawn from tvtropes.com, along with 501 instances of those types. The ID field indexes into the Freebase character/actor map ID in character.metadata.tsv.


In [14]:
tvtropes_file_name = os.path.join(PATH_FOLDER, 'tvtropes.clusters.txt')

# Each line is read as two tab-separated fields: the character type and a
# JSON object describing one instance of that type.
tvtropes_raw = pd.read_csv(tvtropes_file_name, sep="\t", names=['character', 'data'])

# Parse every JSON payload once and expand its keys into columns with a single
# DataFrame construction (replaces the slow row-wise `.apply(pd.Series)` and
# the intermediate rebuild / re-label / drop steps).
tvtropes_instances = pd.DataFrame(tvtropes_raw['data'].map(json.loads).tolist())

# Character type first, followed by the expanded JSON fields.
tvtropes_metadata = pd.concat([tvtropes_raw['character'], tvtropes_instances], axis=1)

# Labels are assigned positionally, so this assumes every JSON object lists
# its fields in the order name, movie, map ID, actor — TODO confirm.
# The list must be flat: assigning a nested list ([[...]]) builds a one-level
# MultiIndex, which makes `tvtropes_metadata['Character role']` return a
# DataFrame instead of a Series.
tvtropes_metadata.columns = [
    'Character role',
    'Character name',
    'Movie name',
    'Freebase character/actor map ID',
    'Actor name',
]

# Display a few random rows as a quick check of the parsed columns.
tvtropes_metadata.sample(5)


Unnamed: 0,Character role,Character name,Movie name,Freebase character/actor map ID,Actor name
28,bounty_hunter,Beck,The Rundown,/m/0k6bqx,The Rock
432,stoner,Jay,Dogma,/m/0jxdkm,Jason Mewes
499,young_gun,William H. 'Billy the Kid' Bonney,Young Guns II,/m/03lrjk0,Emilio Estevez
377,prima_donna,Sharpay Evans,High School Musical,/m/0jzdz2,Ashley Tisdale
375,playful_hacker,Mr. Universe,Serenity,/m/0k31lb,David Krumholtz


## 2) Name clusters

970 unique character names used in at least two different movies, along with 2,666 instances of those types. The ID field indexes into the Freebase character/actor map ID in character.metadata.tsv.


In [18]:
name_file_name = os.path.join(PATH_FOLDER, 'name.clusters.txt')

# Read the tab-separated (character name, map ID) pairs; the column labels
# are supplied here and no header row is read from the file.
name_metadata = pd.read_csv(
    name_file_name,
    sep="\t",
    header=None,
    names=["character name", "Freebase character/actor map ID"],
)

# Display a few random rows as a quick check of the parsed columns.
name_metadata.sample(5)

Unnamed: 0,character name,Freebase character/actor map ID
1834,Jesse Huston,/m/02vczvc
1064,Tasmanian Devil,/m/0hynl7d
1945,Porky Pig,/m/052bf45
1327,Rosemary Woodhouse,/m/02vbdfj
1276,Daffy Duck,/m/0gx6xc0


# DATA ANALYSIS & PROCESSING

In [21]:
# Count repeated values in each column of the plot-summary table.
duplicate_id_count = summary_metadata["Movie ID"].duplicated().sum()
duplicate_summary_count = summary_metadata["Movie Summary"].duplicated().sum()
print(duplicate_id_count)       # 0 in the recorded run: movie IDs are unique
print(duplicate_summary_count)  # 8 in the recorded run: some summaries repeat

# Drop rows whose summary text repeats an earlier row (first occurrence is
# kept), printing the shape before and after to show how many rows were removed.
print(summary_metadata.shape)
summary_metadata = summary_metadata.drop_duplicates(subset="Movie Summary", keep="first")
print(summary_metadata.shape)

0
8
(42303, 2)
(42295, 2)
