# Director choice algorithm

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [10]:
#genre = 'Drama'

In [71]:
director_df = pd.read_csv('directors.csv')
data = pd.read_csv('movies_cleaned_dataset.csv')

In [72]:
from scipy.stats import zscore

# Filling missing values in specific columns with the mean of those columns
data['Inflation_adjusted_profit'].fillna(data['Inflation_adjusted_profit'].mean(), inplace=True)
data['averageRating'].fillna(data['averageRating'].mean(), inplace=True)
data['Oscar_Wins'].fillna(data['Oscar_Wins'].mean(), inplace=True)
data['Nominations'].fillna(data['Nominations'].mean(), inplace=True)

# Capping extreme values in the 'Inflation adjusted profit' column
# This reduces the impact of outliers by setting a threshold (90th percentile here)
# Values above this threshold are set to the threshold value itself
cap_threshold = data['Inflation_adjusted_profit'].quantile(0.90) 
data['capped_profit'] = data['Inflation_adjusted_profit'].clip(upper=cap_threshold)

# Calculating the Z-scores (standard scores) for the capped profit, average rating, Oscar wins, and nominations
# This standardizes these features to have a mean of 0 and a standard deviation of 1, aiding in comparison
z_capped_profit = zscore(data['capped_profit'])
z_rating = zscore(data['averageRating'])
z_oscars = zscore(data['Oscar_Wins'])
z_nominations = zscore(data['Nominations'])

# Assigning weights to each of these standardized features
weight_capped_profit = 0.3
weight_rating = 0.35
weight_oscars = 0.175
weight_nominations = 0.175

# Calculating a 'Successful' score based on these weighted features
# This is a composite metric considering profit, rating, Oscars, and nominations
data['Successful'] = (
    weight_capped_profit * z_capped_profit +
    weight_rating * z_rating +
    weight_oscars * z_oscars +
    weight_nominations * z_nominations)

# Normalizing the 'Successful' score to a 0-10 scale
# This makes the score more interpretable and standardized
data['Successful'] = round((data['Successful'] - data['Successful'].min()) / (data['Successful'].max() - data['Successful'].min()) * 10,1)


In [73]:
data

Unnamed: 0,Movie_name,Movie_box_office_revenue,Movie_runtime,Movie_genres,Main_genre,Main_language,Top_genres,Main_country,Main_continent,Plot_summary,...,Estimated_Budget,IMDb,Oscar_Wins,Nominations,Profit,Inflation_adjusted_profit,nconst,Director_name,capped_profit,Successful
0,Ghosts of Mars,14010832,98.0,"{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...","['Thriller', 'Science Fiction', 'Horror', 'Adv...",English Language,Thriller,United States of America,USA,"Set in the second half of the 22nd century, th...",...,28000000,tt0228333,0,0,-13989168,-1.870336e+07,nm0000118,John Carpenter,-1.870336e+07,3.9
1,White Of The Eye,0,110.0,"{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic...","['Thriller', 'Erotic thriller', 'Psychological...",English Language,Thriller,United Kingdom,Europe,A series of murders of rich young women throug...,...,0,tt0094320,0,0,0,0.000000e+00,nm0131910,Donald Cammell,0.000000e+00,4.5
2,A Woman in Flames,0,106.0,"{""/m/07s9rl0"": ""Drama""}",Drama,German Language,Other,Germany,Europe,"Eva, an upper class housewife, becomes frustra...",...,0,tt0083949,0,0,0,0.000000e+00,nm0885554,Robert van Ackeren,0.000000e+00,4.4
3,The Sorcerer's Apprentice,0,86.0,"{""/m/0hqxf"": ""Family Film"", ""/m/01hmnh"": ""Fant...","['Family Film', 'Fantasy', 'Adventure', 'World...",English Language,Family Film,South Africa,Africa,"Every hundred years, the evil Morgana returns...",...,0,,0,0,0,0.000000e+00,,Unknown,0.000000e+00,4.5
4,Little city,0,93.0,"{""/m/06cvj"": ""Romantic comedy"", ""/m/0hj3n0w"": ...","['Romantic comedy', 'Ensemble Film', 'Comedy-d...",English Language,Drama,United States of America,USA,"Adam, a San Francisco-based artist who works a...",...,0,tt0119548,0,0,0,0.000000e+00,nm0070142,Roberto Benabib,0.000000e+00,4.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42150,The Ghost Train,0,82.0,"{""/m/0lsxr"": ""Crime Fiction"", ""/m/01jfsb"": ""Th...","['Crime Fiction', 'Thriller', 'Comedy', 'Super...",English Language,Thriller,United Kingdom,Europe,{{plot}} The film opens with a Great Western e...,...,0,tt0033660,0,0,0,0.000000e+00,nm0285962,Walter Forde,0.000000e+00,4.5
42151,Mermaids: The Body Found,0,120.0,"{""/m/07s9rl0"": ""Drama""}",Drama,English Language,Other,United States of America,USA,Two former National Oceanic Atmospheric Admini...,...,0,,0,0,0,0.000000e+00,,Unknown,0.000000e+00,4.5
42152,Knuckle,0,96.0,"{""/m/03bxz7"": ""Biographical film"", ""/m/07s9rl0...","['Biographical film', 'Drama', 'Documentary']",English Language,Drama,Ireland,,{{No plot}} This film follows 12 years in the ...,...,0,tt1606259,0,0,0,0.000000e+00,nm3817974,Ian Palmer,0.000000e+00,4.6
42153,The Super Dimension Fortress Macross II: Lover...,0,150.0,"{""/m/06n90"": ""Science Fiction"", ""/m/0gw5n2f"": ...","['Science Fiction', 'Japanese Movies', 'Advent...",Japanese Language,Science Fiction,Japan,Asia,"The story takes place in the year 2092,The Sup...",...,0,,0,0,0,0.000000e+00,,Unknown,0.000000e+00,4.5


In [74]:
drama = ['Drama', 'Crime', 'Thriller']
family = ['Family Film', 'Comedy', 'Musical']
action_adventure = ['Action', 'Action/Adventure']
fiction = ['Science Fiction']
horror = ['Horror']
other = ['Short Film', 'Other']

def map_genre(genre):
    if genre in drama:
        return 'Drama'
    elif genre in family:
        return 'Family'
    elif genre in action_adventure:
        return 'Action/Adventure'
    elif genre in fiction:
        return 'Fiction'
    elif genre in horror:
        return 'Horror'
    else:
        return 'Other'


In [75]:
data['Top_genres'] = data['Top_genres'].apply(map_genre)
data = data.drop(columns= ['Movie_runtime', 'Inflation_adjusted_profit', 'Main_language', 'Oscar_Wins', 'Nominations', 'averageRating', 'Main_country', 'Main_continent', 'Plot_summary', 'numVotes', 'Estimated_Budget'])
data = data.drop(columns=['Movie_box_office_revenue', 'Main_genre', 'Movie_genres', 'IMDb', 'Profit'])

In [76]:
#data = data[data['Top_genres'] == genre]
#data = data.drop(columns=['Top_genres'])


In [77]:
director_df

Unnamed: 0,nconst,director_name,num_movies,movies,birth_year,death_year
0,nm0000005,Ingmar Bergman,85,"['tt0038468', 'tt0038675', 'tt0039834', 'tt004...",1918,2007
1,nm0000008,Marlon Brando,1,['tt0055257'],1924,2004
2,nm0000009,Richard Burton,1,['tt0062898'],1925,1984
3,nm0000010,James Cagney,1,['tt0050964'],1899,1986
4,nm0000018,Kirk Douglas,1,['tt0073559'],1916,2020
...,...,...,...,...,...,...
686037,nm9993679,Art Jones,1,['tt8744074'],\N,\N
686038,nm9993694,Chinmay Mishra,2,"['tt18361688', 'tt18687502']",\N,\N
686039,nm9993696,Ibrahim-Aloduley,1,['tt8744160'],\N,\N
686040,nm9993708,Eli Bevins,9,"['tt11702702', 'tt11753904', 'tt11772812', 'tt...",\N,\N


In [78]:
data2 = data.merge(director_df, left_on='nconst', right_on='nconst', how='left')

In [79]:
data2

Unnamed: 0,Movie_name,Release_Date,tconst,nconst,Director_name,capped_profit,Successful,director_name,num_movies,movies,birth_year,death_year
0,Ghosts of Mars,2001,tt0228333,nm0000118,John Carpenter,-1.870336e+07,3.9,John Carpenter,32.0,"['tt0056410', 'tt0064383', 'tt0064384', 'tt006...",1948,\N
1,White Of The Eye,1987,tt0094320,nm0131910,Donald Cammell,0.000000e+00,4.5,Donald Cammell,5.0,"['tt0066214', 'tt0075931', 'tt0094320', 'tt011...",1934,1996
2,Little city,1997,tt0119548,nm0070142,Roberto Benabib,0.000000e+00,4.4,Roberto Benabib,1.0,['tt0119548'],\N,\N
3,Henry V,1989,tt0097499,nm0000110,Kenneth Branagh,2.217323e+06,5.5,Kenneth Branagh,23.0,"['tt0097499', 'tt0101669', 'tt0105130', 'tt010...",1960,\N
4,Baby Boy,2001,tt0255819,nm0005436,John Singleton,1.160929e+07,4.7,John Singleton,19.0,"['tt0101507', 'tt0107840', 'tt0113305', 'tt012...",1968,2019
...,...,...,...,...,...,...,...,...,...,...,...,...
16110,Gopi Kishan,1994,tt0109922,nm0240881,Mukesh Duggal,0.000000e+00,4.4,Mukesh Duggal,1.0,['tt0109922'],\N,1997
16111,Eşrefpaşalılar,2010,,,Unknown,2.005960e+06,4.5,,,,,
16112,The Ghost Train,1941,tt0033660,nm0285962,Walter Forde,0.000000e+00,4.5,Walter Forde,52.0,"['tt0019542', 'tt0019566', 'tt0020605', 'tt002...",1898,1984
16113,Knuckle,2011,tt1606259,nm3817974,Ian Palmer,0.000000e+00,4.6,Ian Palmer,1.0,['tt1606259'],\N,\N


In [80]:
data2['birth_year'] = data2['birth_year'].apply(lambda x: '0' if (x == '\\N' or x == 'NaN') else x)

In [81]:
data2 = data2.dropna(subset=['birth_year', 'Release_Date'])
data2['Release_Date'] = data2['Release_Date'].astype(int)
data2['birth_year'] = data2['birth_year'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2['Release_Date'] = data2['Release_Date'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2['birth_year'] = data2['birth_year'].astype(int)


In [82]:
data2['Director_age'] = data2['Release_Date'] - data2['birth_year']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2['Director_age'] = data2['Release_Date'] - data2['birth_year']


In [83]:
data2 = data2[data2['Director_age'] < 110]
data2 = data2[data2['Director_age'] > 0]
data2 = data2.drop(columns = 'director_name')
data2 

Unnamed: 0,Movie_name,Release_Date,tconst,nconst,Director_name,capped_profit,Successful,num_movies,movies,birth_year,death_year,Director_age
0,Ghosts of Mars,2001,tt0228333,nm0000118,John Carpenter,-1.870336e+07,3.9,32.0,"['tt0056410', 'tt0064383', 'tt0064384', 'tt006...",1948,\N,53
1,White Of The Eye,1987,tt0094320,nm0131910,Donald Cammell,0.000000e+00,4.5,5.0,"['tt0066214', 'tt0075931', 'tt0094320', 'tt011...",1934,1996,53
3,Henry V,1989,tt0097499,nm0000110,Kenneth Branagh,2.217323e+06,5.5,23.0,"['tt0097499', 'tt0101669', 'tt0105130', 'tt010...",1960,\N,29
4,Baby Boy,2001,tt0255819,nm0005436,John Singleton,1.160929e+07,4.7,19.0,"['tt0101507', 'tt0107840', 'tt0113305', 'tt012...",1968,2019,33
5,Rudo y Cursi,2008,tt0405393,nm0190860,Carlos Cuarón,1.160929e+07,4.8,11.0,"['tt0178027', 'tt0304246', 'tt0326614', 'tt034...",1966,\N,42
...,...,...,...,...,...,...,...,...,...,...,...,...
16105,Inspiration,1931,tt0022001,nm0113284,Clarence Brown,0.000000e+00,4.5,52.0,"['tt0011246', 'tt0011387', 'tt0012183', 'tt001...",1890,1987,41
16107,Mirage,1972,tt0068955,nm0324162,Armando Robles Godoy,0.000000e+00,4.6,9.0,"['tt0061622', 'tt0065798', 'tt0068955', 'tt018...",1923,2010,49
16108,The Flying Serpent,1946,tt0038531,nm0627864,Sam Newfield,0.000000e+00,4.2,324.0,"['tt0017643', 'tt0020631', 'tt0020674', 'tt002...",1899,1964,47
16109,Guilty as Sin,1993,tt0107057,nm0001486,Sidney Lumet,1.160929e+07,4.6,154.0,"['tt0041039', 'tt0042098', 'tt0045458', 'tt004...",1924,2011,69


In [84]:
data2 = data2.drop(columns = ['birth_year', 'death_year', 'movies'])
data2['num_movies'] = data2['num_movies'].astype(int)

In [99]:
data2 = data2[data2['num_movies'] < 100]


In [None]:
data2.to_csv('director_processed.csv', index=False)

# ML (regression)