In [33]:
import json
import os

import pandas as pd
from IPython.display import display

In [34]:
# Data source: http://www.cs.cmu.edu/~ark/personas/

# Directories used by the notebook. Both end with a slash because file names
# are appended to them by plain string concatenation.
DATA_PATH = "data/"            # raw input files are read from here
NEW_DATA_PATH = "clean_data/"  # cleaned CSV tables are written here

# 1. Reformat the data to have a clean Database

In [35]:
# The name.clusters.txt file is deliberately not loaded: it is not needed for our purpose

In [36]:
# Separate the dict columns for the movies table


def _split_dict_column(movies, column, value_name):
    """Build a (movie_id, value) link table from a JSON-dict column of the movies table.

    Args:
        movies: movies DataFrame holding an "id" column and the `column` to split.
        column: name of the column whose cells are JSON objects serialized as strings.
        value_name: name given to the value column of the returned table.

    Returns:
        A DataFrame with the columns ["movie_id", value_name], one row per
        (movie, value) pair, with a fresh 0..n-1 index.
    """
    # Only the values of each JSON object are kept; the keys are discarded
    values = movies[column].apply(lambda raw: list(json.loads(raw).values()))
    # NOTE: per the pandas docs, explode keeps one row with NaN for an empty list,
    # so a movie whose JSON object is empty still yields one row here.
    return (
        pd.DataFrame({"movie_id": movies["id"], value_name: values})
        .explode(value_name)
        .reset_index(drop=True)
    )


# Load the data (the TSV has no header row, so the column names are given here)
movies_df = pd.read_csv(DATA_PATH + 'movie.metadata.tsv', delimiter='\t', header=None, names=['wiki_id', 'id', 'name', 'release_date', 'revenue', 'runtime', 'languages', 'countries', 'genres'])
display(movies_df.head())

# Separate the languages to another table
movies_languages_df = _split_dict_column(movies_df, "languages", "language")
display(movies_languages_df.head())

# Separate the genres to another table
movies_genres_df = _split_dict_column(movies_df, "genres", "genre")
display(movies_genres_df.head())

# Separate the countries to another table
movies_countries_df = _split_dict_column(movies_df, "countries", "country")
display(movies_countries_df.head())

# Drop the dict columns from movies_df now that they live in their own tables
movies_df = movies_df.drop(columns=["languages", "genres", "countries"])
display(movies_df.head())

Unnamed: 0,wiki_id,id,name,release_date,revenue,runtime,languages,countries,genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}"


Unnamed: 0,movie_id,language
0,/m/03vyhn,English Language
1,/m/08yl5d,English Language
2,/m/0crgdbh,Norwegian Language
3,/m/0285_cd,English Language
4,/m/01mrr1,German Language


Unnamed: 0,movie_id,genre
0,/m/03vyhn,Thriller
1,/m/03vyhn,Science Fiction
2,/m/03vyhn,Horror
3,/m/03vyhn,Adventure
4,/m/03vyhn,Supernatural


Unnamed: 0,movie_id,country
0,/m/03vyhn,United States of America
1,/m/08yl5d,United States of America
2,/m/0crgdbh,Norway
3,/m/0285_cd,United Kingdom
4,/m/01mrr1,Germany


Unnamed: 0,wiki_id,id,name,release_date,revenue,runtime
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0


In [37]:
# Add the summary to the movie table.
# NOTE: this is an inner join, so movies that have no plot summary are dropped
# from movies_df (e.g. "Brun bitter" is in the raw movies table but not in the
# merged one). Every remaining movie therefore has a summary.

# Load the data (the file has no header row, so the column names are given here)
movies_summaries_df = pd.read_csv(DATA_PATH + 'plot_summaries.txt', delimiter='\t', header=None, names=['wiki_id', 'summary'])
display(movies_summaries_df.head())

# Merge the summaries with the movies table.
# Any "summary" column already present is dropped first so that re-running this
# cell does not merge onto an already-merged frame and produce
# summary_x / summary_y columns.
movies_df = pd.merge(
    movies_df.drop(columns=["summary"], errors="ignore"),
    movies_summaries_df,
    on="wiki_id",
    how="inner",
)
display(movies_df.head())

Unnamed: 0,wiki_id,summary
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
1,31186339,The nation of Panem consists of a wealthy Capi...
2,20663735,Poovalli Induchoodan is sentenced for six yea...
3,2231378,"The Lemon Drop Kid , a New York City swindler,..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...


Unnamed: 0,wiki_id,id,name,release_date,revenue,runtime,summary
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"Set in the second half of the 22nd century, th..."
1,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,A series of murders of rich young women throug...
2,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"Eva, an upper class housewife, becomes frustra..."
3,18998739,/m/04jcqvw,The Sorcerer's Apprentice,2002,,86.0,"Every hundred years, the evil Morgana returns..."
4,6631279,/m/0gffwj,Little city,1997-04-04,,93.0,"Adam, a San Francisco-based artist who works a..."


In [38]:
# Store the characters in a separate table

# Column names for the header-less TSV, in file order
character_columns = [
    'movie_wiki_id',
    'movie_id',
    'movie_release_date',
    'name',
    'actor_birth_date',
    'actor_gender',
    'actor_height',
    'actor_ethinicity_id',
    'actor_name',
    'actor_age',
    'actor_map_id',
    'id',
    'actor_id',
]

# Load the data
characters_df = pd.read_csv(
    DATA_PATH + 'character.metadata.tsv',
    delimiter='\t',
    header=None,
    names=character_columns,
)
display(characters_df.head())

Unnamed: 0,movie_wiki_id,movie_id,movie_release_date,name,actor_birth_date,actor_gender,actor_height,actor_ethinicity_id,actor_name,actor_age,actor_map_id,id,actor_id
0,975900,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7
1,975900,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.78,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4
2,975900,/m/03vyhn,2001-08-24,Desolation Williams,1969-06-15,M,1.727,/m/0x67,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l
3,975900,/m/03vyhn,2001-08-24,Sgt Jericho Butler,1967-09-12,M,1.75,,Jason Statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc
4,975900,/m/03vyhn,2001-08-24,Bashira Kincaid,1977-09-25,F,1.65,,Clea DuVall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg


In [39]:
# Clean the characters categories table

# Load the data: one row per (category, JSON metadata string) pair
characters_categories_df = pd.read_csv(
    DATA_PATH + 'tvtropes.clusters.txt',
    delimiter='\t',
    header=None,
    names=['category', 'meta'],
)
display(characters_categories_df.head())

# Parse the JSON metadata, keep only its "id" entry as character_id,
# and remove the now-useless meta column
parsed_meta = characters_categories_df["meta"].map(json.loads)
characters_categories_df = (
    characters_categories_df
    .assign(character_id=parsed_meta.map(lambda meta: meta["id"]))
    .drop(columns=["meta"])
)
display(characters_categories_df.head())

Unnamed: 0,category,meta
0,absent_minded_professor,"{""char"": ""Professor Philip Brainard"", ""movie"":..."
1,absent_minded_professor,"{""char"": ""Professor Keenbean"", ""movie"": ""Richi..."
2,absent_minded_professor,"{""char"": ""Dr. Reinhardt Lane"", ""movie"": ""The S..."
3,absent_minded_professor,"{""char"": ""Dr. Harold Medford"", ""movie"": ""Them!..."
4,absent_minded_professor,"{""char"": ""Daniel Jackson"", ""movie"": ""Stargate""..."


Unnamed: 0,category,character_id
0,absent_minded_professor,/m/0jy9q0
1,absent_minded_professor,/m/02vchl3
2,absent_minded_professor,/m/0k6fkc
3,absent_minded_professor,/m/0k6_br
4,absent_minded_professor,/m/0k3rhh


In [40]:
# Save the data

# Create the output directory if needed: to_csv does not create missing
# parent directories and would fail on a fresh checkout.
os.makedirs(NEW_DATA_PATH, exist_ok=True)

# Output file name -> table to write (all written without the index column)
tables_to_save = {
    'movies.csv': movies_df,
    'movies_languages.csv': movies_languages_df,
    'movies_genres.csv': movies_genres_df,
    'movies_countries.csv': movies_countries_df,
    'characters.csv': characters_df,
    'characters_categories.csv': characters_categories_df,
}
for file_name, table_df in tables_to_save.items():
    table_df.to_csv(NEW_DATA_PATH + file_name, index=False)

# 2. Group the movies into categories

In [50]:
# Country groups: group name -> set of country names belonging to it.
# The "Northen America" spelling is kept as is because the key is looked up by
# that exact string when the movies are split by group.
groups = {
    "Northen America": {"United States of America", "Canada"},
    "Europe": {
        "United Kingdom",
        "France",
        "Italy",
        "Germany",
        "Spain",
        "West Germany",
        "Belgium",
        "German Democratic Republic",
        "Ireland",
        "Switzerland",
        "Austria",
        "England",
        "Luxembourg",
        "Portugal",
    },
    "Asia": {"Japan", "Hong Kong", "China", "South Korea", "Taiwan"},
    "India": {"India", "Pakistan"},
}

In [51]:
# Reload the cleaned movies and movie/country tables from disk
movies_df = pd.read_csv(NEW_DATA_PATH + 'movies.csv')
display(movies_df.head())

movies_countries_df = pd.read_csv(NEW_DATA_PATH + 'movies_countries.csv')
display(movies_countries_df.head())

# Attach the countries to the movies: the result has one row per (movie, country)
# pair, so a movie with several countries appears several times
movies_df = movies_df.merge(
    movies_countries_df,
    left_on="id",
    right_on="movie_id",
)
display(movies_df.head())

Unnamed: 0,wiki_id,id,name,release_date,revenue,runtime,summary
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"Set in the second half of the 22nd century, th..."
1,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,A series of murders of rich young women throug...
2,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"Eva, an upper class housewife, becomes frustra..."
3,18998739,/m/04jcqvw,The Sorcerer's Apprentice,2002,,86.0,"Every hundred years, the evil Morgana returns..."
4,6631279,/m/0gffwj,Little city,1997-04-04,,93.0,"Adam, a San Francisco-based artist who works a..."


Unnamed: 0,movie_id,country
0,/m/03vyhn,United States of America
1,/m/08yl5d,United States of America
2,/m/0crgdbh,Norway
3,/m/0285_cd,United Kingdom
4,/m/01mrr1,Germany


Unnamed: 0,wiki_id,id,name,release_date,revenue,runtime,summary,movie_id,country
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"Set in the second half of the 22nd century, th...",/m/03vyhn,United States of America
1,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,A series of murders of rich young women throug...,/m/0285_cd,United Kingdom
2,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"Eva, an upper class housewife, becomes frustra...",/m/01mrr1,Germany
3,18998739,/m/04jcqvw,The Sorcerer's Apprentice,2002,,86.0,"Every hundred years, the evil Morgana returns...",/m/04jcqvw,South Africa
4,6631279,/m/0gffwj,Little city,1997-04-04,,93.0,"Adam, a San Francisco-based artist who works a...",/m/0gffwj,United States of America


In [53]:
# Separate the countries by groups


def _export_group_movies(merged_movies_df, group_name, file_name):
    """Save the unique ids of the movies whose country belongs to a group.

    Args:
        merged_movies_df: movies table with one row per (movie, country) pair,
            holding at least the "id" and "country" columns.
        group_name: key of the country group in the `groups` dict.
        file_name: name of the CSV file written under NEW_DATA_PATH.

    Returns:
        A DataFrame with a single "movie_id" column, one row per distinct movie.
    """
    in_group = merged_movies_df["country"].isin(groups[group_name])
    # unique() collapses movies matched through several countries of the same group
    group_movie_ids = merged_movies_df.loc[in_group, "id"].unique()
    group_movies_df = pd.DataFrame(group_movie_ids, columns=["movie_id"])
    group_movies_df.to_csv(NEW_DATA_PATH + file_name, index=False)
    print(f"There are {len(group_movies_df)} movies from {group_name}")
    return group_movies_df


# Each group is filtered independently, so a movie whose countries span
# several groups is listed in each of them.

# Northen America
northen_america_movies_df = _export_group_movies(movies_df, "Northen America", 'northen_america_movies.csv')

# Europe
europe_movies_df = _export_group_movies(movies_df, "Europe", 'europe_movies.csv')

# Asia
asia_movies_df = _export_group_movies(movies_df, "Asia", 'asia_movies.csv')

# India
india_movies_df = _export_group_movies(movies_df, "India", 'india_movies.csv')

There are 21642 movies from Northen America
There are 8301 movies from Europe
There are 3187 movies from Asia
There are 4827 movies from India
