In [1]:
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns



In [2]:
# Read the five tab-separated corpus files; none of them has a header row,
# so the columns come back as integer positions.
path = '/Users/anna/Desktop/MovieSummaries/'
name_clusters = pd.read_csv(f'{path}name.clusters.txt', sep='\t', header=None)
tv_tropes = pd.read_csv(f'{path}tvtropes.clusters.txt', sep='\t', header=None)
plot_summaries = pd.read_csv(f'{path}plot_summaries.txt', sep='\t', header=None)
movie_metadata = pd.read_csv(f'{path}movie.metadata.tsv', sep='\t', header=None)
character_metadata = pd.read_csv(f'{path}character.metadata.tsv', sep='\t', header=None)


In [3]:
# Give the header-less frames descriptive column names (one name per line so
# the position of each column is easy to check against the raw files).
name_clusters.columns = [
    'char_name',
    'char_actor_map_freebase',
]
tv_tropes.columns = [
    'type',
    'trope',
]
movie_metadata.columns = [
    'Wikipedia_ID',
    'Freebase_ID',
    'movie_name',
    'movie_release_date',
    'revenue',
    'runtime',
    'language_freebase',
    'country_freebase',
    'genre_freebase',
]
character_metadata.columns = [
    'Wikipedia_ID',
    'Freebase_ID',
    'movie_release_date',
    'char_name',
    'birthday',
    'gender',
    'height_m',
    'eth_freebase',
    'actor_name',
    'age_release',
    'char_actor_map_freebase',
    'char_freebase',
    'actor_freebase',
]


In [4]:
#Cleaning the data, adding columns without the freebase ID
import ast

def extract(id_tuple):
    """Return the values of a serialized ``{freebase_id: name}`` mapping as a list.

    The metadata cells store JSON objects such as
    ``{"/m/02h40lc": "English Language"}``, so the text is parsed as JSON
    first. Text that is not strict JSON (for example a single-quoted Python
    dict repr) falls back to ``ast.literal_eval``.

    Parameters
    ----------
    id_tuple : str or None or float
        Serialized mapping. ``None`` / ``NaN`` are treated as a missing cell.

    Returns
    -------
    list
        The mapping's values in their original order; ``[]`` for a missing
        cell or an empty mapping.

    Raises
    ------
    ValueError, SyntaxError
        If the text is neither valid JSON nor a valid Python literal.
    """
    # Missing cells (None, or float NaN which is the only float != itself)
    # carry no names; return an empty list instead of failing in the parser.
    if id_tuple is None or (isinstance(id_tuple, float) and id_tuple != id_tuple):
        return []
    try:
        # JSON parsing handles null/true/false and JSON escapes correctly.
        mapping = json.loads(id_tuple)
    except ValueError:
        # json.JSONDecodeError subclasses ValueError: not JSON, so try a
        # Python literal instead.
        mapping = ast.literal_eval(id_tuple)
    return list(mapping.values())


# For each Freebase-coded column, add a sibling column holding only the
# extracted values (inserted in the order genre, language, country).
for target, source in (
    ('genre', 'genre_freebase'),
    ('language', 'language_freebase'),
    ('country', 'country_freebase'),
):
    movie_metadata[target] = movie_metadata[source].apply(extract)

movie_metadata

Unnamed: 0,Wikipedia_ID,Freebase_ID,movie_name,movie_release_date,revenue,runtime,language_freebase,country_freebase,genre_freebase,genre,language,country
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...","[Thriller, Science Fiction, Horror, Adventure,...",[English Language],[United States of America]
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp...","[Mystery, Biographical film, Drama, Crime Drama]",[English Language],[United States of America]
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D...","[Crime Fiction, Drama]",[Norwegian Language],[Norway]
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic...","[Thriller, Erotic thriller, Psychological thri...",[English Language],[United Kingdom]
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}",[Drama],[German Language],[Germany]
...,...,...,...,...,...,...,...,...,...,...,...,...
81736,35228177,/m/0j7hxnt,Mermaids: The Body Found,2011-03-19,,120.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/07s9rl0"": ""Drama""}",[Drama],[English Language],[United States of America]
81737,34980460,/m/0g4pl34,Knuckle,2011-01-21,,96.0,"{""/m/02h40lc"": ""English Language""}","{""/m/03rt9"": ""Ireland"", ""/m/07ssc"": ""United Ki...","{""/m/03bxz7"": ""Biographical film"", ""/m/07s9rl0...","[Biographical film, Drama, Documentary]",[English Language],"[Ireland, United Kingdom]"
81738,9971909,/m/02pygw1,Another Nice Mess,1972-09-22,,66.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/06nbt"": ""Satire"", ""/m/01z4y"": ""Comedy""}","[Satire, Comedy]",[English Language],[United States of America]
81739,913762,/m/03pcrp,The Super Dimension Fortress Macross II: Lover...,1992-05-21,,150.0,"{""/m/03_9r"": ""Japanese Language""}","{""/m/03_3d"": ""Japan""}","{""/m/06n90"": ""Science Fiction"", ""/m/0gw5n2f"": ...","[Science Fiction, Japanese Movies, Adventure, ...",[Japanese Language],[Japan]


In [5]:
# Percentage of missing values in each column of the movie table.
nan_percentage_movie = movie_metadata.isna().sum().div(len(movie_metadata)).mul(100)
print(nan_percentage_movie)

Wikipedia_ID           0.000000
Freebase_ID            0.000000
movie_name             0.000000
movie_release_date     8.443743
revenue               89.722416
runtime               25.018045
language_freebase      0.000000
country_freebase       0.000000
genre_freebase         0.000000
genre                  0.000000
language               0.000000
country                0.000000
dtype: float64


We see that the largest share of missing values in our movie data is in the 'revenue' column (about 90%). We have therefore decided not to use revenue as a success measure in the rest of our analysis, since it would restrict our data points too much. Instead, we decided to work with IMDb ratings.

In [6]:
# Remove the sparsely populated 'revenue' column. errors='ignore' keeps this
# cell re-runnable: movie_metadata is reassigned here, so a second run would
# otherwise raise KeyError because the column is already gone.
movie_metadata = movie_metadata.drop(columns=['revenue'], errors='ignore')

In [7]:
# Percentage of missing values in each column of the character table.
nan_percentage_character = character_metadata.isna().sum().div(len(character_metadata)).mul(100)
print(nan_percentage_character)

Wikipedia_ID                0.000000
Freebase_ID                 0.000000
movie_release_date          2.217814
char_name                  57.220488
birthday                   23.552763
gender                     10.120288
height_m                   65.645740
eth_freebase               76.466542
actor_name                  0.272484
age_release                35.084064
char_actor_map_freebase     0.000000
char_freebase              57.218269
actor_freebase              0.180842
dtype: float64


In [21]:
# Load the Freebase -> IMDb id mapping, align its key name with
# movie_metadata and discard the 'item' column.
imdb_map_id = (
    pd.read_csv('/Users/anna/Desktop/query.tsv', sep='\t')
    .rename(columns={"freebaseID": "Freebase_ID"})
    .drop(columns=['item'])
)

# Inner join: only movies that have an entry in the mapping are kept.
movie_imdb = pd.merge(movie_metadata, imdb_map_id, how='inner', on=['Freebase_ID'])

Unnamed: 0,Wikipedia_ID,Freebase_ID,movie_name,movie_release_date,runtime,language_freebase,country_freebase,genre_freebase,genre,language,country,imdbID
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...","[Thriller, Science Fiction, Horror, Adventure,...",[English Language],[United States of America],tt0228333
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp...","[Mystery, Biographical film, Drama, Crime Drama]",[English Language],[United States of America],tt0245916
2,28463795,/m/0crgdbh,Brun bitter,1988,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D...","[Crime Fiction, Drama]",[Norwegian Language],[Norway],tt0094806
3,9363483,/m/0285_cd,White Of The Eye,1987,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic...","[Thriller, Erotic thriller, Psychological thri...",[English Language],[United Kingdom],tt0094320
4,261236,/m/01mrr1,A Woman in Flames,1983,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}",[Drama],[German Language],[Germany],tt0083949
...,...,...,...,...,...,...,...,...,...,...,...,...
74347,35228177,/m/0j7hxnt,Mermaids: The Body Found,2011-03-19,120.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/07s9rl0"": ""Drama""}",[Drama],[English Language],[United States of America],tt1816585
74348,34980460,/m/0g4pl34,Knuckle,2011-01-21,96.0,"{""/m/02h40lc"": ""English Language""}","{""/m/03rt9"": ""Ireland"", ""/m/07ssc"": ""United Ki...","{""/m/03bxz7"": ""Biographical film"", ""/m/07s9rl0...","[Biographical film, Drama, Documentary]",[English Language],"[Ireland, United Kingdom]",tt1606259
74349,9971909,/m/02pygw1,Another Nice Mess,1972-09-22,66.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/06nbt"": ""Satire"", ""/m/01z4y"": ""Comedy""}","[Satire, Comedy]",[English Language],[United States of America],tt0362411
74350,913762,/m/03pcrp,The Super Dimension Fortress Macross II: Lover...,1992-05-21,150.0,"{""/m/03_9r"": ""Japanese Language""}","{""/m/03_3d"": ""Japan""}","{""/m/06n90"": ""Science Fiction"", ""/m/0gw5n2f"": ...","[Science Fiction, Japanese Movies, Adventure, ...",[Japanese Language],[Japan],tt0113726


In [26]:
# Load the IMDb ratings and rename their key so it matches movie_imdb.
imdb_rating = (
    pd.read_csv('/Users/anna/Desktop/rating.tsv', sep='\t')
    .rename(columns={"tconst": "imdbID"})
)

# Inner join: only movies that have a rating entry are kept.
movie_rating = pd.merge(movie_imdb, imdb_rating, how='inner', on=['imdbID'])
movie_rating

Unnamed: 0,Wikipedia_ID,Freebase_ID,movie_name,movie_release_date,runtime,language_freebase,country_freebase,genre_freebase,genre,language,country,imdbID,averageRating,numVotes
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...","[Thriller, Science Fiction, Horror, Adventure,...",[English Language],[United States of America],tt0228333,4.9,56928
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp...","[Mystery, Biographical film, Drama, Crime Drama]",[English Language],[United States of America],tt0245916,6.0,69
2,28463795,/m/0crgdbh,Brun bitter,1988,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D...","[Crime Fiction, Drama]",[Norwegian Language],[Norway],tt0094806,5.6,41
3,9363483,/m/0285_cd,White Of The Eye,1987,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic...","[Thriller, Erotic thriller, Psychological thri...",[English Language],[United Kingdom],tt0094320,6.1,2895
4,261236,/m/01mrr1,A Woman in Flames,1983,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}",[Drama],[German Language],[Germany],tt0083949,5.9,623
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68394,35228177,/m/0j7hxnt,Mermaids: The Body Found,2011-03-19,120.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/07s9rl0"": ""Drama""}",[Drama],[English Language],[United States of America],tt1816585,4.6,1711
68395,34980460,/m/0g4pl34,Knuckle,2011-01-21,96.0,"{""/m/02h40lc"": ""English Language""}","{""/m/03rt9"": ""Ireland"", ""/m/07ssc"": ""United Ki...","{""/m/03bxz7"": ""Biographical film"", ""/m/07s9rl0...","[Biographical film, Drama, Documentary]",[English Language],"[Ireland, United Kingdom]",tt1606259,6.8,3194
68396,9971909,/m/02pygw1,Another Nice Mess,1972-09-22,66.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/06nbt"": ""Satire"", ""/m/01z4y"": ""Comedy""}","[Satire, Comedy]",[English Language],[United States of America],tt0362411,5.8,112
68397,913762,/m/03pcrp,The Super Dimension Fortress Macross II: Lover...,1992-05-21,150.0,"{""/m/03_9r"": ""Japanese Language""}","{""/m/03_3d"": ""Japan""}","{""/m/06n90"": ""Science Fiction"", ""/m/0gw5n2f"": ...","[Science Fiction, Japanese Movies, Adventure, ...",[Japanese Language],[Japan],tt0113726,6.0,657
