In [365]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import requests
from SPARQLWrapper import SPARQLWrapper, JSON

## Load data

In [366]:
data_folder = '../projet/Data/MovieSummaries/'
column_names= ['Wiki_ID', 'Movie_ID','title','release_date','BoxOfficeRevenue','Runtime','Languages','Countries','Genres']
data_original=pd.read_csv(data_folder + 'movie.metadata.tsv', sep = '\t', names=column_names)

column_names= ['Wiki_ID', 'Movie_ID','release_date','character_name','actor_birth_date','actor_gender','actor_height','ethnicity_ID','actor_name','actor_age_movie_released','character/actor_ID','character_ID','Actor_ID','Movie_title','release_year', 'birth_year','Movie_Count','ethnicity']
data_character=pd.read_csv(data_folder + 'character.metadata.tsv', sep = '\t', names=column_names)

In [367]:
data_folder = '../projet/Data/'
data_success = pd.read_csv(data_folder+'movie_data_successmetric_rating.csv')
data_success = data_success.rename(columns={'pondered_rating': 'rating'})

Adding the rating we go from 81 741 movies to 46 737 movies (lost 35 004 movies so 43%)

In [368]:
#release date given either in YYYY-MM-DD or YYYY so we extract the year release such
data_original['Year'] = data_original['release_date'].astype(str).str[:4]
data_original['Year'] = pd.to_numeric(data_original['Year'], errors='coerce').astype('Int64')

Add a column with title of movie in data_character:

In [369]:
data_character = data_character.merge(
    data_original[['Movie_ID', 'title']],
    on='Movie_ID',
    how='left'
)
data_character.rename(columns={'title': 'Movie_title'}, inplace=True)

# Cleaning Movie metadata

In [370]:
data_original.describe()

Unnamed: 0,Wiki_ID,BoxOfficeRevenue,Runtime,Year
count,81741.0,8401.0,61291.0,74839.0
mean,17407840.0,47993630.0,111.8192,1977.47653
std,10987910.0,112175300.0,4360.07,29.101536
min,330.0,10000.0,0.0,1010.0
25%,7323695.0,2083193.0,81.0,1956.0
50%,17778990.0,10639690.0,93.0,1985.0
75%,27155730.0,40716960.0,106.0,2004.0
max,37501920.0,2782275000.0,1079281.0,2016.0


Min and max BoxOfficeRevenue are coherent

Run time min and max are not coherent

Min year is not coherent

In [371]:
data_original.sort_values(by = 'Runtime', ascending =False)[:5]

Unnamed: 0,Wiki_ID,Movie_ID,title,release_date,BoxOfficeRevenue,Runtime,Languages,Countries,Genres,Year
12804,10815585,/m/02qqy23,Zero Tolerance,1995,,1079281.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/0lsxr"": ""Crime F...",1995.0
62962,32441022,/m/0gyr803,Modern Times Forever,,,14400.0,{},{},"{""/m/0jtdp"": ""Documentary""}",
71100,25345684,/m/09gqhh3,Cinématon,2011,,9360.0,"{""/m/064_8sq"": ""French Language""}","{""/m/0f8l9c"": ""France""}","{""/m/0424mc"": ""Experimental film"", ""/m/0jtdp"":...",2011.0
21733,14545195,/m/03qcghh,Matrjoschka,2006-04-23,,5700.0,{},"{""/m/0345h"": ""Germany""}","{""/m/0219x_"": ""Indie"", ""/m/0jtdp"": ""Documentary""}",2006.0
46666,884435,/m/03lmv2,The Cure for Insomnia,1987-01-31,,5220.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4lw"": ""Art film""}",1987.0


The first right value is Modern Times Forever (240h long), we will remove the longer films

In [372]:
data_original.sort_values(by = 'Runtime', ascending =True)[:5]

Unnamed: 0,Wiki_ID,Movie_ID,title,release_date,BoxOfficeRevenue,Runtime,Languages,Countries,Genres,Year
42478,786716,/m/03c6bq,Dickson Experimental Sound Film,1894.0,,0.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02hmvc"": ""Short Film""}",1894.0
57963,1082508,/m/044ggd,Roundhay Garden Scene,1888.0,,0.03,"{""/m/06ppq"": ""Silent film""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/02hmvc"": ""Short Film"", ""/m/06ppq"": ""Silen...",1888.0
55686,26044155,/m/0b6f62m,Sallie Gardner at a Gallop,,,0.05,"{""/m/06ppq"": ""Silent film""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02hmvc"": ""Short Film"", ""/m/06ppq"": ""Silen...",
25984,644824,/m/02_1qn,Dickson Greeting,1891.0,,0.05,{},"{""/m/09c7w0"": ""United States of America""}","{""/m/02hmvc"": ""Short Film"", ""/m/0219x_"": ""Indie""}",1891.0
7486,32175981,/m/0gy0t95,Men Boxing,1891.0,,0.083333,"{""/m/06ppq"": ""Silent film""}",{},"{""/m/02hmvc"": ""Short Film"", ""/m/06ppq"": ""Silen...",1891.0


All short movies do exist. There exist indeed really short films less than a minute.

In [373]:
wrong_values = data_original[
    (data_original['Runtime'] <= 0) |
    (data_original['Runtime'] > 14400)
]

data_original.loc[wrong_values.index, 'Runtime'] = pd.NA

print(f"There are {wrong_values.shape[0]} movies with invalid runtimes. These have been replaced with NaNs.")


There are 2 movies with invalid runtimes. These have been replaced with NaNs.


In [374]:
data_original.sort_values(by = 'Year', ascending =True)[:5]

Unnamed: 0,Wiki_ID,Movie_ID,title,release_date,BoxOfficeRevenue,Runtime,Languages,Countries,Genres,Year
62836,29666067,/m/0fphzrf,Hunting Season,1010-12-02,12160978.0,140.0,"{""/m/02hwyss"": ""Turkish Language"", ""/m/02h40lc...","{""/m/01znc_"": ""Turkey""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/02n4kr"": ""My...",1010
57963,1082508,/m/044ggd,Roundhay Garden Scene,1888,,0.03,"{""/m/06ppq"": ""Silent film""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/02hmvc"": ""Short Film"", ""/m/06ppq"": ""Silen...",1888
38764,12170539,/m/02vs8rb,"Leisurely Pedestrians, Open Topped Buses and H...",1889,,,"{""/m/06ppq"": ""Silent film"", ""/m/02h40lc"": ""Eng...","{""/m/07ssc"": ""United Kingdom""}","{""/m/02hmvc"": ""Short Film"", ""/m/06ppq"": ""Silen...",1889
53907,6431996,/m/0g53t3,"Monkeyshines, No. 1",1890,,,"{""/m/06ppq"": ""Silent film""}","{""/m/09c7w0"": ""United States of America""}","{""/m/06ppq"": ""Silent film""}",1890
60249,22770416,/m/0gj9h_4,London's Trafalgar Square,1890,,,"{""/m/06ppq"": ""Silent film""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/02hmvc"": ""Short Film"", ""/m/06ppq"": ""Silen...",1890


In [375]:
data_original.loc[data_original['title'] == 'Hunting Season','Year'] = 2010
data_original.loc[data_original['title'] == 'Hunting Season','release_date'] = 2010

Only the first movie is wrong but since the cineam has been invented in 1895 only the films after that are reall movies and not only short videos

In [376]:
print(f"Number of movies before {data_original.shape[0]}")

wrong_values = data_original[
    (data_original['Year'] <= 1895) |
    (data_original['Year'] >= 2013)
]

removed_movies = data_original.iloc[wrong_values.index]['Movie_ID']
data_original.drop(wrong_values.index, axis=0, inplace=True)

print(f"There are {wrong_values.shape[0]} movies with invalid release_date. These have been replaced removed.")

print(f"Number of movies after {data_original.shape[0]}")

Number of movies before 81741
There are 228 movies with invalid release_date. These have been replaced removed.
Number of movies after 81513


### Remove the movies from the other datasets:

In [377]:
def remove_movies_from_df(df,data_name):
    shape_before = df.shape[0]
    rows_to_drop = df[df['Movie_ID'].isin(removed_movies)]
    df.drop(rows_to_drop.index, axis=0, inplace=True)
    print(f"{shape_before - df.shape[0]} elements info from {data_name} removed")

In [378]:
remove_movies_from_df(data_character, 'data_character')
remove_movies_from_df(data_success, 'data_sucess')

999 elements info from data_character removed
0 elements info from data_sucess removed


### Removing duplicates

In [379]:
before_size = data_original.shape[0]
unique_actors_df = data_original.drop_duplicates(subset=['Movie_ID'], keep='first')
print(f"Removed {before_size-unique_actors_df.shape[0]} duplicates")

Removed 0 duplicates


## Cleaning character data_frame

### Removing duplicates

In [380]:
original_dimension = data_character.shape[0]
data_character.drop_duplicates(subset=['Actor_ID', 'Movie_ID','character_ID'], keep= 'first', inplace=True)

print(f"Removing {original_dimension - data_character.shape[0]} duplicates")

Removing 316 duplicates


### Actors' date and age

In [381]:
data_character.describe()

Unnamed: 0,Wiki_ID,actor_height,actor_age_movie_released,Movie_title,release_year,birth_year,Movie_Count,ethnicity
count,449354.0,154098.0,291663.0,0.0,0.0,0.0,0.0,0.0
mean,13925330.0,1.78903,37.7793,,,,,
std,10768980.0,4.390239,20.604441,,,,,
min,330.0,0.61,-7896.0,,,,,
25%,3751746.0,1.6764,28.0,,,,,
50%,11825920.0,1.75,36.0,,,,,
75%,23593180.0,1.83,47.0,,,,,
max,37501920.0,510.0,103.0,,,,,


There are incorrect values of age, replace with nan value

In [382]:
wrong_values = data_character[
    (data_character['actor_age_movie_released'] <= 5) |
    (data_character['actor_age_movie_released'] >= 90)
]

data_character.loc[wrong_values.index, 'actor_age_movie_released'] = pd.NA

print(f"There are {wrong_values.shape[0]} actors with invalid ages (<= 5 or >= 90). These have been replaced with NaNs.")

There are 1085 actors with invalid ages (<= 5 or >= 90). These have been replaced with NaNs.


Adds release year missing in character_data present in data_original:

In [383]:
print(f"There are {data_original['Year'].isna().sum()} missing values in realised year in metadata of movies")
print(f"There are {data_character['release_date'].isna().sum()} missing values in realised year in character dataframe")
print(f"There are {data_character['actor_birth_date'].isna().sum()} missing values in actor birth date year in characters dataframe")

There are 6902 missing values in realised year in metadata of movies
There are 9994 missing values in realised year in character dataframe
There are 105749 missing values in actor birth date year in characters dataframe


In [384]:
merged_df = pd.merge(data_character, data_original[['Movie_ID', 'Year']], on='Movie_ID', suffixes=('_women', '_imdb'), how='left')
data_character['release_year'] = merged_df['Year']

data_character['birth_year'] = data_character['actor_birth_date'].astype(str).str[:4]
data_character['birth_year'] = pd.to_numeric(data_character['birth_year'], errors='coerce').astype('Int64')

In [385]:
print(f"Missing values before process: {data_character['actor_age_movie_released'].isna().sum()}")

nan_indices = data_character[data_character['actor_age_movie_released'].isna()].index
calculated_age = (data_character.loc[nan_indices, 'release_year'] - data_character.loc[nan_indices, 'birth_year']).astype(float)
data_character.loc[nan_indices, 'actor_age_movie_released'] = calculated_age

print(f"Missing values after process: {data_character['actor_age_movie_released'].isna().sum()}")

Missing values before process: 158776
Missing values after process: 107185


In [386]:
data_character.loc[767]

Wiki_ID                          15007384
Movie_ID                       /m/03h4h5p
release_date                   1934-05-02
character_name                        NaN
actor_birth_date               1963-11-07
actor_gender                          NaN
actor_height                          NaN
ethnicity_ID                          NaN
actor_name                  Franck Dubosc
actor_age_movie_released            -29.0
character/actor_ID             /m/0bwb9k3
character_ID                          NaN
Actor_ID                       /m/01wlly9
Movie_title                           NaN
release_year                         1934
birth_year                           1963
Movie_Count                           NaN
ethnicity                             NaN
Movie_title                  Le Grand jeu
Name: 767, dtype: object

Remove invalide release and birth years

Some actors are associated with wrong movie! Ex with Franck Dubosc associated with "Le grand jeu" in which he never played and which was released before his birth.

There is no way to know when it it the case in general but we can still remove every characters for which the birth year of the actor is before the release year of the movie. We might remove rows for which a date is wrong but we make sure we remove big incoherence.

In [387]:
valid_data = data_character['actor_age_movie_released'].dropna()
wrong_values = valid_data[(data_character['actor_age_movie_released'] < 0) | 
                          (data_character['actor_age_movie_released'] > 95)]

data_character.drop(wrong_values.index, axis=0, inplace=True)
#Replace 
print(f'There are {wrong_values.shape[0]} actors with negative or 0 age when movie released. They have been deleted')
print(data_character.shape)

There are 23203 actors with negative or 0 age when movie released. They have been deleted
(426151, 19)


### Actor height

In [388]:
wrong_values = data_character[
    (data_character['actor_height'] < 0.8) |
    (data_character['actor_height'] > 2.72)
] #2 m 72 is the world's record for height: Robert Wadlow

data_character.loc[wrong_values.index, 'actor_height'] = pd.NA

print(f"There are {wrong_values.shape[0]} actors with invalid height (<= 0.8 or >= 2.72). These have been replaced with NaNs.")

There are 15 actors with invalid height (<= 0.8 or >= 2.72). These have been replaced with NaNs.


Since there are only a very small number of actors with wrong height format we will just discard them and not try to put it back to the right format

### Adding Movie count

In [389]:
data_character['Movie_Count'] = data_character.groupby('Actor_ID')['Actor_ID'].transform('count')

In [390]:
data_character.drop_duplicates(subset=['Actor_ID'], keep='first').sort_values(by = 'Movie_Count', ascending =False)[:5]

Unnamed: 0,Wiki_ID,Movie_ID,release_date,character_name,actor_birth_date,actor_gender,actor_height,ethnicity_ID,actor_name,actor_age_movie_released,character/actor_ID,character_ID,Actor_ID,Movie_title,release_year,birth_year,Movie_Count,ethnicity,Movie_title.1
519,11717027,/m/02rpz18,,,1908-05-30,M,,/m/041rx,Mel Blanc,,/m/052c_g2,,/m/0c5vh,,,1908,745.0,,Is There a Doctor in the Mouse?
805,30173306,/m/0g59khw,1995-01-06,,1950-06-16,M,1.83,/m/0bpjh3,Mithun Chakraborty,44.0,/m/0gvwn56,,/m/04c636,,1995.0,1950,323.0,,Ab Insaf Hoga
138,9633533,/m/02pml15,1989,Unnikrishnan,1960-05-21,M,1.72,/m/0dryh9k,Mohanlal,28.0,/m/03lzkb6,/m/0h8gsxq,/m/02fbpz,,1989.0,1960,231.0,,Vandanam
2574,22475578,/m/05zj4mp,1994-10-23,Ravishankar,1951-09-07,M,1.78,/m/04mvp8,Mammootty,43.0,/m/0c0581q,/m/0h27xc1,/m/02hkv5,,1968.0,1951,224.0,,Sukrutham
1817,26687336,/m/0bmh28z,2011-01-14,Dharam Singh Dhillon,1935-12-08,M,1.78,,Dharmendra Deol,75.0,/m/0gm2fww,/m/0h5rsrj,/m/02n1gr,,2011.0,1935,217.0,,Yamla Pagla Deewana


In [391]:
valid_data = data_character['Movie_Count'].dropna()
wrong_values = valid_data[(data_character['Movie_Count'] >=540)]

data_character.drop(wrong_values.index, axis=0, inplace=True)
#Replace 
print(f'There are {wrong_values.shape[0]} lines with negative or 0 age when movie released. They have been deleted')
print(data_character.shape)

There are 745 lines with negative or 0 age when movie released. They have been deleted
(425406, 19)


Most productive actor with 744 movies is a voice actor so we will remove it, the next ones are reall very productive actors

### Actors' ethnicity

In [392]:
data_character.describe()

Unnamed: 0,Wiki_ID,actor_height,actor_age_movie_released,Movie_title,release_year,birth_year,Movie_Count,ethnicity
count,425406.0,149174.0,318229.0,0.0,414145.0,319657.0,424889.0,0.0
mean,13803220.0,1.748652,39.445805,,1983.933171,1946.525932,19.494087,
std,10767530.0,0.106206,16.074934,,25.336779,25.84457,29.128303,
min,330.0,0.813,0.0,,1896.0,1804.0,1.0,
25%,3659082.0,1.68,28.0,,1968.0,1927.0,2.0,
50%,11530660.0,1.75,37.0,,1993.0,1950.0,8.0,
75%,23413260.0,1.83,49.0,,2005.0,1967.0,26.0,
max,37501920.0,2.356,95.0,,2012.0,2007.0,323.0,


In [393]:
unique_actors_df = data_character.drop_duplicates(subset=['Actor_ID'], keep='first').copy()

#Extract Unique Ethnicities
unique_ethnicities = unique_actors_df['ethnicity_ID'].dropna().unique()

#Function to Query Wikidata Using Freebase IDs
def query_wikidata_ethnicity(freebase_id):
    freebase_id = freebase_id.replace('"', '\\"')
    query = f"""
    SELECT ?item ?itemLabel ?article
    WHERE {{
      ?item wdt:P646 "{freebase_id}" .
      OPTIONAL {{
        ?article schema:about ?item ;
                 schema:isPartOf <https://en.wikipedia.org/> .
      }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
    }}
    LIMIT 1
    """
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    sparql.addCustomHttpHeader('User-Agent', 'YourAppName/1.0 (your.email@example.com)')
    try:
        results = sparql.query().convert()
        bindings = results['results']['bindings']
        if bindings:
            result = bindings[0]
            wikidata_id = result['item']['value'].split('/')[-1]
            article = result.get('article', {}).get('value', None)
            item_label = result.get('itemLabel', {}).get('value', None)
            return wikidata_id, item_label, article
        else:
            return pd.NA, pd.NA, pd.NA
    except Exception as e:
        print(f"Error querying for ethnicity '{freebase_id}': {e}")
        return pd.NA, pd.NA, pd.NA

#Loop Over Ethnicities
ethnicity_data = []

for freebase_id in unique_ethnicities:
    wikidata_id, item_label, article = query_wikidata_ethnicity(freebase_id)
    count = unique_actors_df[unique_actors_df['ethnicity_ID'] == freebase_id].shape[0]
    ethnicity_data.append({
        'ethnicity_ID': freebase_id,
        'wikidata_id': wikidata_id,
        'ethnicity_label': item_label,
        'corresponding_ethnicity': article,
        'count': count
    })

ethnicity_df = pd.DataFrame(ethnicity_data)

In [394]:
ethnicity_df.sort_values(by= "count", ascending=False)[:20]

Unnamed: 0,ethnicity_ID,wikidata_id,ethnicity_label,corresponding_ethnicity,count
1,/m/0x67,Q49085,African Americans,https://en.wikipedia.org/wiki/African_Americans,1464
8,/m/0dryh9k,Q862086,Indians,https://en.wikipedia.org/wiki/Indian_people,721
3,/m/041rx,Q7325,Jewish people,https://en.wikipedia.org/wiki/Jews,703
34,/m/02ctzb,Q235155,White people,https://en.wikipedia.org/wiki/White_people,492
24,/m/02w7gg,Q42406,English people,https://en.wikipedia.org/wiki/English_people,383
48,/m/07hwkr,Q49078,White Americans,https://en.wikipedia.org/wiki/White_Americans,237
4,/m/033tf_,Q1075293,Irish Americans,https://en.wikipedia.org/wiki/Irish_Americans,196
0,/m/044038p,,,,145
19,/m/0xnvg,Q974693,Italian Americans,https://en.wikipedia.org/wiki/Italian_Americans,139
75,/m/0d7wh,Q842438,British,https://en.wikipedia.org/wiki/British_people,122


In [395]:
unique_actors_df[unique_actors_df['ethnicity_ID']=='/m/044038p'].sort_values(by= 'Movie_Count', ascending=False)[:10]

Unnamed: 0,Wiki_ID,Movie_ID,release_date,character_name,actor_birth_date,actor_gender,actor_height,ethnicity_ID,actor_name,actor_age_movie_released,character/actor_ID,character_ID,Actor_ID,Movie_title,release_year,birth_year,Movie_Count,ethnicity,Movie_title.1
384,5894429,/m/0fc8w8,2007-09-14,Jack,1929-12-13,M,1.778,/m/044038p,Christopher Plummer,77.0,/m/0jwjks,/m/0h1yq64,/m/01ycbq,,2007,1929,93.0,,Closing the Ring
6311,19115982,/m/04jg1sc,1999,Hugh Sanford,1946-12-17,M,1.778,/m/044038p,Eugene Levy,52.0,/m/0cs3c4v,/m/0h5hp6h,/m/028k57,,2012,1946,59.0,,The Secret Life of Girls
11000,6201527,/m/0fwhh4,2004,,1949-07-27,M,1.85,/m/044038p,Maury Chaykin,54.0,/m/0cg9937,,/m/07csf4,,1990,1949,56.0,,Intern Academy
35948,26340083,/m/0bbxwx5,2010-07-27,Batman,1956-08-12,M,1.8,/m/044038p,Bruce Greenwood,53.0,/m/0bd46jp,/m/01d5g,/m/01yfm8,,2005,1956,50.0,,Batman: Under the Red Hood
4502,1598832,/m/05fkhv,1985-08-02,State Trooper,1950-10-31,M,1.88,/m/044038p,John Candy,34.0,/m/0bykgxx,/m/0h1jnrq,/m/0mfj2,,1985,1950,48.0,,Sesame Street presents Follow That Bird
773,11692389,/m/02rp4qv,2005-12-24,,1942-03-30,M,1.753,/m/044038p,Kenneth Welsh,63.0,/m/04htvb8,,/m/06rn5d,,2005,1942,46.0,,The Snow Queen
5595,6691690,/m/0ghjvz,2006-08-06,Peter Campbell,1966-02-12,M,1.8,/m/044038p,Lochlyn Munro,40.0,/m/03jsz_w,/m/0l5dx1s,/m/03hv_6,,2006,1966,39.0,,The Tooth Fairy
39503,27486082,/m/04j25wc,1990,,1956-05-09,F,1.75,/m/044038p,Wendy Crewson,33.0,/m/04j25wm,,/m/02f5jr,,1968,1956,36.0,,Getting Married in Buffalo Jump
11220,4018828,/m/0bcs_f,2006-04-28,Todd Mallory,1970-05-04,M,1.892,/m/044038p,Will Arnett,35.0,/m/0k2rmc,/m/0h2bbk1,/m/03q43g,,2000,1970,35.0,,RV
19019,2077360,/m/06kl78,1996-05-17,Catherine Ballard,1966-05-12,F,1.7,/m/044038p,Deborah Kara Unger,30.0,/m/0jwrdz,/m/0gz34r1,/m/06s7y4,,1997,1966,33.0,,Crash


Looking at the actors' webpage we can identify the ethnicity '/m/044038p' as Canadian

In [396]:
ethnicity_df.loc[ethnicity_df['ethnicity_ID'] == '/m/044038p', 'ethnicity_label'] = 'Canadian'

Add the ethnicity_label in the character dataframe

In [397]:
data_character = data_character.merge(
    ethnicity_df[['ethnicity_ID', 'ethnicity_label']],
    on='ethnicity_ID',
    how='left'
)
data_character.rename(columns={'ethnicity_label': 'ethnicity'}, inplace=True)


# Adding a column for succes_rating

In [398]:
data_success['BoxOfficeRank'] = data_success['Net_revenue'].rank(pct=True)
data_success['RatingRank'] = data_success['rating'].rank(pct=True)

weight_box_office = 0.5
weight_imdb = 0.5

data_success['SuccessMetric'] = (weight_box_office * data_success['BoxOfficeRank']) + (weight_imdb * data_success['RatingRank'])

# Save cleaned data

In [399]:
data_success.to_csv('./data/clean_data_success.csv', index=0)
data_character.to_csv('./data/clean_data_character.csv', index=0)
data_original.to_csv('./data/clean_data_original.csv', index=0)
ethnicity_df.to_csv('./data/ethnicity_labels.csv', index=0)