## 1. LOAD AND CLEAN CMU DATASETS

In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import ast



!!CHANGE YOUR PATH in read_csv call, in next cell and also when reading query.tsv, ratingd.tsv and budget (movies_metadata from Kaggle)

link to download the needed csv/tsv files:
query.tsv : https://query.wikidata.org/#PREFIX%20wd%3A%20%3Chttp%3A%2F%2Fwww.wikidata.org%2Fentity%2F%3E%0APREFIX%20wdt%3A%20%3Chttp%3A%2F%2Fwww.wikidata.org%2Fprop%2Fdirect%2F%3E%0APREFIX%20wikibase%3A%20%3Chttp%3A%2F%2Fwikiba.se%2Fontology%23%3E%0A%0ASELECT%20%3Fitem%20%3FfreebaseID%20%3FimdbID%0AWHERE%20%7B%0A%20%20%3Fitem%20wdt%3AP31%2Fwdt%3AP279%2a%20wd%3AQ11424.%0A%20%20%3Fitem%20wdt%3AP646%20%3FfreebaseID.%0A%20%20%3Fitem%20wdt%3AP345%20%3FimdbID.%0A%20%20%7D
then run the code an download the result

ratings.csv : https://datasets.imdbws.com download title.ratingd.tsv.giz

budget : https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset/discussion/333586, download the whole directory (need an account, fast to create) and then we use the movies_metadata.csv file

In [64]:
#Importing the data

path = '/Users/camille/Desktop/ada/PROJECT/data/MovieSummaries/'

name_clusters = pd.read_csv(path+'name.clusters.txt', delimiter='\t',header=None)
tv_tropes = pd.read_csv(path+'tvtropes.clusters.txt',delimiter='\t',header=None)
plot_summaries = pd.read_csv(path+'plot_summaries.txt',delimiter='\t',header=None)
movie_metadata = pd.read_csv(path+'movie.metadata.tsv',delimiter='\t',header=None)
character_metadata = pd.read_csv(path+'character.metadata.tsv',delimiter='\t',header=None)


In [65]:
# defining columns names for all datasets

movie_columns = ['Wikipedia_ID','Freebase_ID','movie_name','movie_release_date','revenue','runtime',
                 'language_freebase','country_freebase','genre_freebase']
character_columns = ['Wikipedia_ID','Freebase_ID','movie_release_date','char_name','birthday','gender','height_m',
                            'eth_freebase','actor_name','age_release','char_actor_map_freebase','char_freebase','actor_freebase']
tv_tropes_columns = ['type','trope']
names_clusters_columns = ['char_name','char_actor_map_freebase']

movie_metadata.columns = movie_columns
character_metadata.columns = character_columns
tv_tropes.columns = tv_tropes_columns
name_clusters.columns = names_clusters_columns

In [67]:
#Cleaning the data for language, genre and country columns

def extract(id_tuple):
    input_dict = ast.literal_eval(id_tuple)
    second_elements = list(input_dict.values())
    return second_elements  # Join the second elements into a single string


movie_metadata['genre'] = movie_metadata['genre_freebase'].apply(extract)
movie_metadata['language'] = movie_metadata['language_freebase'].apply(extract)
movie_metadata['country'] = movie_metadata['country_freebase'].apply(extract)

movie_metadata = movie_metadata.drop(columns=['language_freebase', 'country_freebase','genre_freebase'])
                                     
# Cleaning the language column, ex: [English Language, Russian Language] becomes [English, Russian]
def clean_language(input_strs):
    clean_strs = []
    for i in range(len(input_strs)):
        clean_str = input_strs[i].split('Language')[0].strip()
        clean_strs.append(clean_str)
    return clean_strs

movie_metadata['language'] = movie_metadata['language'].apply(clean_language)


movie_metadata

Unnamed: 0,Wikipedia_ID,Freebase_ID,movie_name,movie_release_date,revenue,runtime,genre,language,country
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"[Thriller, Science Fiction, Horror, Adventure,...",[English],[United States of America]
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"[Mystery, Biographical film, Drama, Crime Drama]",[English],[United States of America]
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"[Crime Fiction, Drama]",[Norwegian],[Norway]
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"[Thriller, Erotic thriller, Psychological thri...",[English],[United Kingdom]
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,[Drama],[German],[Germany]
...,...,...,...,...,...,...,...,...,...
81736,35228177,/m/0j7hxnt,Mermaids: The Body Found,2011-03-19,,120.0,[Drama],[English],[United States of America]
81737,34980460,/m/0g4pl34,Knuckle,2011-01-21,,96.0,"[Biographical film, Drama, Documentary]",[English],"[Ireland, United Kingdom]"
81738,9971909,/m/02pygw1,Another Nice Mess,1972-09-22,,66.0,"[Satire, Comedy]",[English],[United States of America]
81739,913762,/m/03pcrp,The Super Dimension Fortress Macross II: Lover...,1992-05-21,,150.0,"[Science Fiction, Japanese Movies, Adventure, ...",[Japanese],[Japan]


In [68]:
nan_percentage_movie = (movie_metadata.isna().sum() / len(movie_metadata)) * 100
print(nan_percentage_movie)

Wikipedia_ID           0.000000
Freebase_ID            0.000000
movie_name             0.000000
movie_release_date     8.443743
revenue               89.722416
runtime               25.018045
genre                  0.000000
language               0.000000
country                0.000000
dtype: float64


We see that the biggest percentage of missing values in our movie data is in the 'revenue' column. We have therefore decided to not use this data as a succes standard continuing with our analysis since it will restrict our data points too much. Instead, we decided to work with IMDB ratingsa instead.

In [69]:
movie_metadata = movie_metadata.drop(columns=['revenue'])

In [70]:
nan_percentage_character = (character_metadata.isna().sum() / len(character_metadata)) * 100
print(nan_percentage_character)

Wikipedia_ID                0.000000
Freebase_ID                 0.000000
movie_release_date          2.217814
char_name                  57.220488
birthday                   23.552763
gender                     10.120288
height_m                   65.645740
eth_freebase               76.466542
actor_name                  0.272484
age_release                35.084064
char_actor_map_freebase     0.000000
char_freebase              57.218269
actor_freebase              0.180842
dtype: float64


We can see that we have a lot of missing values for character names, but we will keep this columns as we need it for our future analysis on sequels??

## 2. MERGING DATASETS 
2.1 : movie_ratings : merge movie_metadata with IMDB ratings (using first a merge with the map of IMDB id and Freebase id

In [71]:
#Loading IMDB ID data
imdb_map_id = pd.read_csv('/Users/camille/Desktop/ada/PROJECT/data/query.tsv',delimiter='\t')
#same name as in movie_metadata
imdb_map_id=imdb_map_id.rename(columns={"freebaseID": "Freebase_ID"})
imdb_map_id=imdb_map_id.drop(columns=['item'])

#Merging movie_metadata with IMDB ID
movie_imdb = movie_metadata.merge(imdb_map_id, how='inner',on=['Freebase_ID'])
movie_imdb

Unnamed: 0,Wikipedia_ID,Freebase_ID,movie_name,movie_release_date,runtime,genre,language,country,imdbID
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,98.0,"[Thriller, Science Fiction, Horror, Adventure,...",[English],[United States of America],tt0228333
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,95.0,"[Mystery, Biographical film, Drama, Crime Drama]",[English],[United States of America],tt0245916
2,28463795,/m/0crgdbh,Brun bitter,1988,83.0,"[Crime Fiction, Drama]",[Norwegian],[Norway],tt0094806
3,9363483,/m/0285_cd,White Of The Eye,1987,110.0,"[Thriller, Erotic thriller, Psychological thri...",[English],[United Kingdom],tt0094320
4,261236,/m/01mrr1,A Woman in Flames,1983,106.0,[Drama],[German],[Germany],tt0083949
...,...,...,...,...,...,...,...,...,...
74347,35228177,/m/0j7hxnt,Mermaids: The Body Found,2011-03-19,120.0,[Drama],[English],[United States of America],tt1816585
74348,34980460,/m/0g4pl34,Knuckle,2011-01-21,96.0,"[Biographical film, Drama, Documentary]",[English],"[Ireland, United Kingdom]",tt1606259
74349,9971909,/m/02pygw1,Another Nice Mess,1972-09-22,66.0,"[Satire, Comedy]",[English],[United States of America],tt0362411
74350,913762,/m/03pcrp,The Super Dimension Fortress Macross II: Lover...,1992-05-21,150.0,"[Science Fiction, Japanese Movies, Adventure, ...",[Japanese],[Japan],tt0113726


In [72]:
#Loading IMDB rating data

imdb_rating = pd.read_csv('/Users/camille/Desktop/ada/PROJECT/data/ratings.tsv',delimiter='\t')
imdb_rating = imdb_rating.rename(columns={"tconst": "imdbID"})

#Merging movie data with IMDB ratings
movie_rating = movie_imdb.merge(imdb_rating,how='inner',on=['imdbID'])
movie_rating

# movie_rating.to_csv('data/' + 'movie_rating.csv', index=False) ?? 

Unnamed: 0,Wikipedia_ID,Freebase_ID,movie_name,movie_release_date,runtime,genre,language,country,imdbID,averageRating,numVotes
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,98.0,"[Thriller, Science Fiction, Horror, Adventure,...",[English],[United States of America],tt0228333,4.9,56928
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,95.0,"[Mystery, Biographical film, Drama, Crime Drama]",[English],[United States of America],tt0245916,6.0,69
2,28463795,/m/0crgdbh,Brun bitter,1988,83.0,"[Crime Fiction, Drama]",[Norwegian],[Norway],tt0094806,5.6,41
3,9363483,/m/0285_cd,White Of The Eye,1987,110.0,"[Thriller, Erotic thriller, Psychological thri...",[English],[United Kingdom],tt0094320,6.1,2895
4,261236,/m/01mrr1,A Woman in Flames,1983,106.0,[Drama],[German],[Germany],tt0083949,5.9,623
...,...,...,...,...,...,...,...,...,...,...,...
68394,35228177,/m/0j7hxnt,Mermaids: The Body Found,2011-03-19,120.0,[Drama],[English],[United States of America],tt1816585,4.6,1711
68395,34980460,/m/0g4pl34,Knuckle,2011-01-21,96.0,"[Biographical film, Drama, Documentary]",[English],"[Ireland, United Kingdom]",tt1606259,6.8,3194
68396,9971909,/m/02pygw1,Another Nice Mess,1972-09-22,66.0,"[Satire, Comedy]",[English],[United States of America],tt0362411,5.8,112
68397,913762,/m/03pcrp,The Super Dimension Fortress Macross II: Lover...,1992-05-21,150.0,"[Science Fiction, Japanese Movies, Adventure, ...",[Japanese],[Japan],tt0113726,6.0,657


2.2 : movie_ratings_budget
(only for vicky to use the movie_ratings_budgets data set produced)

In [73]:
# budgets data set cleaning

#specified low_memory to False because warning was given that column 10 had mixed types
kaggle_movies = pd.read_csv('/Users/camille/Desktop/ada/PROJECT/data/Kaggle/movies_metadata.csv', low_memory=False)

budgets = kaggle_movies[['imdb_id','budget']].copy()

def to_numeric(x):
    if x.isnumeric():
        return x
    else:
        return "0"

# put every non-numeric values to "0", then drop all values 0 for budget
budgets['budget'] = budgets['budget'].apply(to_numeric)  
budgets['budget'] = budgets['budget'].apply(int)  

budgets.loc[budgets['budget'] == 0, 'budget'] = np.nan # explanation : it is like a .loc[row indexer, col indexer]
budgets = budgets.dropna(subset = ['budget'])

# budgets dataset merging
budgets = budgets.rename(columns={"imdb_id": "imdbID"})
movie_ratings_budgets = movie_rating.merge(budgets,how='inner',on=['imdbID'])
movie_ratings_budgets

Unnamed: 0,Wikipedia_ID,Freebase_ID,movie_name,movie_release_date,runtime,genre,language,country,imdbID,averageRating,numVotes,budget
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,98.0,"[Thriller, Science Fiction, Horror, Adventure,...",[English],[United States of America],tt0228333,4.9,56928,28000000.0
1,10408933,/m/02qc0j7,Alexander's Ragtime Band,1938-08-16,106.0,"[Musical, Comedy, Black-and-white]",[English],[United States of America],tt0029852,6.8,2268,2000000.0
2,171005,/m/016ywb,Henry V,1989-11-08,137.0,"[Costume drama, War film, Epic, Period piece, ...",[English],[United Kingdom],tt0097499,7.5,31208,9000000.0
3,77856,/m/0kcn7,Mary Poppins,1964-08-27,139.0,"[Children's/Family, Musical, Fantasy, Comedy, ...",[English],[United States of America],tt0058331,7.8,181829,6000000.0
4,77856,/m/0kcn7,Mary Poppins,1964-08-27,139.0,"[Children's/Family, Musical, Fantasy, Comedy, ...",[English],[United States of America],tt0058331,7.8,181829,6000000.0
...,...,...,...,...,...,...,...,...,...,...,...,...
6971,303933,/m/01s7w3,Twister,1996-05-10,113.0,"[Action/Adventure, Disaster]",[English],[United States of America],tt0117998,6.5,207829,92000000.0
6972,25920477,/m/0b6lqyd,Source Code,2011-03-11,93.0,"[Thriller, Science Fiction, Action/Adventure, ...",[English],"[France, United States of America]",tt0945513,7.5,540315,32000000.0
6973,54540,/m/0f7hw,Coming to America,1988-06-29,117.0,"[Romantic comedy, Comedy of manners, Drama, Co...",[English],[United States of America],tt0094898,7.1,218976,39000000.0
6974,7761830,/m/0kvgqb,Spaced Invaders,1990,100.0,"[Alien Film, Science Fiction, Family Film, Com...",[English],[United States of America],tt0100666,5.3,3885,3000000.0


2.3 movie_character
anna you can put your merge here maybe?

## 3. FIRST ANALYSIS

- language availability

In [62]:
languages = movie_rating['language'].apply(lambda x: pd.Series(x)).stack().unique()

In [None]:
len(languages)

plot the 10 most used languages

for this 10 plot the average success

remove the only English one, and in others:
- those you dont have english at all: success? : plot the 10 most used one, then their sucess
- compraed to those that have other + english


en gros: can a movie have a big success if it is not avalailable in english 