In [2]:
import numpy as np
import pandas as pd
import datetime as dt
import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
import pandas as pd
import numpy as np
from scipy.stats import zscore

# Load the cleaned, inflation-adjusted movie table.
df_avec_successful = pd.read_csv('../data/clean/cleaned_inflation_adjusted.csv')

# Mean-impute the four components of the score.
# The filled column is assigned back to the frame on purpose:
# `df[col].fillna(..., inplace=True)` works on a column selection, which pandas
# flags as chained assignment and which stops updating the parent frame once
# copy-on-write is enabled.
score_columns = ['Inflation_adjusted_profit', 'averageRating', 'Oscar_Wins', 'Nominations']
for score_column in score_columns:
    column_mean = df_avec_successful[score_column].mean()
    df_avec_successful[score_column] = df_avec_successful[score_column].fillna(column_mean)

# Cap the profit at its 90th percentile so a few blockbusters do not dominate
# the score. Only the upper tail is clipped; low/negative profits are kept as is.
cap_threshold = df_avec_successful['Inflation_adjusted_profit'].quantile(0.90)
df_avec_successful['capped_profit'] = df_avec_successful['Inflation_adjusted_profit'].clip(upper=cap_threshold)

# Standardize every component (z-score) so they are comparable before weighting.
z_capped_profit = zscore(df_avec_successful['capped_profit'])
z_rating = zscore(df_avec_successful['averageRating'])
z_oscars = zscore(df_avec_successful['Oscar_Wins'])
z_nominations = zscore(df_avec_successful['Nominations'])

# Component weights (they add up to 1.0).
weight_capped_profit = 0.3
weight_rating = 0.35
weight_oscars = 0.175
weight_nominations = 0.175

# Composite 'Successful' score: weighted sum of the standardized components.
df_avec_successful['Successful'] = (
    weight_capped_profit * z_capped_profit +
    weight_rating * z_rating +
    weight_oscars * z_oscars +
    weight_nominations * z_nominations)

# Min-max rescale the composite score to a 0-10 scale, rounded to one decimal.
raw_score = df_avec_successful['Successful']
score_min, score_max = raw_score.min(), raw_score.max()
df_avec_successful['Successful'] = ((raw_score - score_min) / (score_max - score_min) * 10).round(1)

# Order the movies from highest to lowest score.
df_avec_successful = df_avec_successful.sort_values(by='Successful', ascending=False)

# Display the 300 best-scoring rows (the frame is sorted, not filtered).
df_avec_successful.head(300)


Unnamed: 0,Movie_name,Release_Date,Movie_box_office_revenue,Movie_runtime,Main_genre,Main_language,Main_country,Plot_summary,tconst,averageRating,...,Movie_languages,Movie_countries,Movie_genres,Estimated_Budget,Oscar_Wins,Nominations,Profit,Inflation_adjusted_profit,capped_profit,Successful
21164,Titanic,1997,2185372302,194.0,Tragedy,Italian Language,United States of America,"In 1996, treasure hunter Brock Lovett and his...",tt0120338,7.9,...,"{""/m/02bjrlw"": ""Italian Language"", ""/m/02h40lc...","{""/m/09c7w0"": ""United States of America""}","{""/m/0fx2s"": ""Tragedy"", ""/m/04xvh5"": ""Costume ...",200000000,11,14,1985372302,2.928104e+09,1.771540e+07,10.0
35423,The Lord of the Rings: The Return of the King,2003,1119929521,250.0,Fantasy Adventure,Old English language,United States of America,"Gandalf, Aragorn, Legolas, Gimli, Théoden, Ga...",tt0167260,9.0,...,"{""/m/05p2d"": ""Old English language"", ""/m/02h40...","{""/m/09c7w0"": ""United States of America"", ""/m/...","{""/m/0hj3n2k"": ""Fantasy Adventure"", ""/m/03k9fj...",94000000,11,11,1025929521,1.320270e+09,1.771540e+07,9.9
21580,Ben-Hur,1959,146900000,219.0,History,English Language,United States of America,"In AD 26, Judah Ben-Hur is a wealthy prince a...",tt0052618,8.1,...,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/03g3w"": ""History"", ""/m/02l7c8"": ""Romance ...",15000000,11,12,131900000,1.071201e+09,1.771540e+07,9.8
22248,West Side Story,1961,43700000,152.0,Crime Fiction,English Language,United States of America,Although the plot summary here is divided into...,tt0055614,7.6,...,"{""/m/02h40lc"": ""English Language"", ""/m/06nm1"":...","{""/m/09c7w0"": ""United States of America""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/04t36"": ""Mus...",6000000,10,11,37700000,2.985766e+08,1.771540e+07,9.2
24962,Gone with the Wind,1939,400000000,234.0,Film adaptation,English Language,United States of America,The film opens on a large cotton plantation c...,tt0031381,8.2,...,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/060__y"": ""Film adaptation"", ""/m/04xvh5"": ...",4000000,8,13,396000000,6.740385e+09,1.771540e+07,9.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19879,The Reader,2008,108901967,119.0,Tragedy,Greek Language,United States of America,"The Reader begins in 1995 Berlin, where Michae...",tt0976051,7.6,...,"{""/m/0349s"": ""Greek Language"", ""/m/04h9h"": ""La...","{""/m/09c7w0"": ""United States of America"", ""/m/...","{""/m/0fx2s"": ""Tragedy"", ""/m/04xvlr"": ""Period p...",32000000,1,5,76901967,8.455767e+07,1.771540e+07,5.7
20732,The Descendants,2011,177243185,114.0,Comedy-drama,English Language,United States of America,Matt King is a Honolulu-based lawyer and the ...,tt1033575,7.3,...,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01t_vv"": ""Comedy-drama""}",20000000,1,5,157243185,1.654898e+08,1.771540e+07,5.7
33151,Dead Poets Society,1989,235860116,129.0,Ensemble Film,English Language,United States of America,"Neil Perry , Todd Anderson , Knox Overstreet ,...",tt0097165,8.1,...,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/0hj3n0w"": ""Ensemble Film"", ""/m/09tvt3"": ""...",16400000,1,4,219460116,4.190977e+08,1.771540e+07,5.7
25146,Crazy Heart,2009,47405566,112.0,Music,English Language,United States of America,"Otis ""Bad"" Blake is a 57-year-old alcoholic s...",tt1263670,7.2,...,"{""/m/02h40lc"": ""English Language"", ""/m/06nm1"":...","{""/m/09c7w0"": ""United States of America""}","{""/m/04rlf"": ""Music"", ""/m/02l7c8"": ""Romance Fi...",7000000,2,3,40405566,4.458652e+07,1.771540e+07,5.7


In [2]:
import pandas as pd
import numpy as np
from scipy.stats import zscore

# Load the cleaned, inflation-adjusted movie table.
# NOTE(review): another cell of this notebook reads the same CSV using
# underscore column names ('Inflation_adjusted_profit', 'Oscar_Wins'); this cell
# uses space-separated names. Only one spelling can match the file on disk -
# confirm which export is current.
df_avec_successful = pd.read_csv('../data/clean/cleaned_inflation_adjusted.csv')

# Mean-impute the four components of the score.
# The filled column is assigned back to the frame on purpose:
# `df[col].fillna(..., inplace=True)` works on a column selection, which pandas
# flags as chained assignment and which stops updating the parent frame once
# copy-on-write is enabled.
score_columns = ['Inflation adjusted profit', 'averageRating', 'Oscar Wins', 'Nominations']
for score_column in score_columns:
    column_mean = df_avec_successful[score_column].mean()
    df_avec_successful[score_column] = df_avec_successful[score_column].fillna(column_mean)

# Cap extreme values of 'Inflation adjusted profit': everything above the 90th
# percentile is set to that threshold. This limits the impact of outliers
# without the complications of a logarithmic transformation.
cap_threshold = df_avec_successful['Inflation adjusted profit'].quantile(0.90)
df_avec_successful['capped_profit'] = df_avec_successful['Inflation adjusted profit'].clip(upper=cap_threshold)

# Z-score of each component.
z_capped_profit = zscore(df_avec_successful['capped_profit'])
z_rating = zscore(df_avec_successful['averageRating'])
z_oscars = zscore(df_avec_successful['Oscar Wins'])
z_nominations = zscore(df_avec_successful['Nominations'])

# Component weights (they add up to 1.0).
weight_capped_profit = 0.3
weight_rating = 0.35
weight_oscars = 0.175
weight_nominations = 0.175

# Composite 'Successful' score: weighted sum of the standardized components.
df_avec_successful['Successful'] = (
    weight_capped_profit * z_capped_profit +
    weight_rating * z_rating +
    weight_oscars * z_oscars +
    weight_nominations * z_nominations)

# Min-max rescale the composite score to a 0-10 scale, rounded to one decimal.
raw_score = df_avec_successful['Successful']
score_min, score_max = raw_score.min(), raw_score.max()
df_avec_successful['Successful'] = ((raw_score - score_min) / (score_max - score_min) * 10).round(1)

# Order the movies from highest to lowest score.
df_avec_successful = df_avec_successful.sort_values(by='Successful', ascending=False)

# Keep only thrillers. The leading space in ' Thriller' is deliberate: it is the
# literal being matched against 'Main genre' here.
o = df_avec_successful[df_avec_successful['Main genre'] == ' Thriller']
# The former bare `o.dropna()` call was removed: its result was discarded, so it
# never changed `o`. The count below therefore still includes rows with NaNs.
print(len(o))
o.head(300)


4965


Unnamed: 0,Movie name,Release Date,Movie box office revenue,Movie runtime,Main genre,Main language,Main country,Plot summary,tconst,averageRating,...,Movie languages,Movie countries,Movie genres,Estimated Budget,Oscar Wins,Nominations,Profit,Inflation adjusted profit,capped_profit,Successful
21179,Slumdog Millionaire,2008,377910544,120.0,Thriller,Hindi Language,United Kingdom,"In Mumbai in 2006, eighteen-year-old Jamal Mal...",tt1010048,8.0,...,"{""/m/03k50"": ""Hindi Language"", ""/m/064_8sq"": ""...","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0lsxr"": ""Crime F...",15000000,8,10,362910544,3.990388e+08,1.771540e+07,8.6
19534,The Silence of the Lambs,1991,272742922,118.0,Thriller,English Language,United States of America,Clarice Starling is pulled from her training ...,tt0102926,8.6,...,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/0hn10"": ""LGBT"", ...",19000000,5,7,253742922,4.410705e+08,1.771540e+07,7.5
30285,Inception,2010,825532764,148.0,Thriller,French Language,United States of America,"Former dream architect Dominick ""Dom"" Cobb a...",tt1375666,8.8,...,"{""/m/064_8sq"": ""French Language"", ""/m/03_9r"": ...","{""/m/09c7w0"": ""United States of America"", ""/m/...","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",160000000,4,8,665532764,7.225485e+08,1.771540e+07,7.4
34979,The French Connection,1971,51700000,104.0,Thriller,French Language,United States of America,The film revolves around the smuggling of narc...,tt0067116,7.7,...,"{""/m/064_8sq"": ""French Language"", ""/m/02h40lc""...","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/0lsxr"": ""Crime F...",1800000,5,8,49900000,2.917424e+08,1.771540e+07,7.4
16675,No Country for Old Men,2007,171627166,123.0,Thriller,English Language,United States of America,"West Texas in June 1980 is desolate, wide open...",tt0477348,8.2,...,"{""/m/02h40lc"": ""English Language"", ""/m/06nm1"":...","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/0lsxr"": ""Crime F...",25000000,4,8,146627166,1.674137e+08,1.771540e+07,7.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18781,Ocean's Thirteen,2007,311312624,121.0,Thriller,French Language,United States of America,"Reuben Tishkoff , in an attempt to legitimize ...",tt0496806,6.9,...,"{""/m/064_8sq"": ""French Language"", ""/m/02h40lc""...","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/0lsxr"": ""Crime F...",85000000,0,0,226312624,2.583957e+08,1.771540e+07,4.6
18793,Blood Simple,1984,4218701,96.0,Thriller,English Language,United States of America,"Julian Marty , the owner of a Texas bar, suspe...",tt0086979,7.6,...,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/02wtdps"": ""Crime...",1500000,0,0,2718701,6.195554e+06,6.195554e+06,4.6
35479,Oru CBI Diary Kurippu,1988,0,137.0,Thriller,Malayalam Language,India,The movie opens with DySP Prabhakara Varma ob...,tt0271694,8.1,...,"{""/m/0999q"": ""Malayalam Language""}","{""/m/03rk0"": ""India""}","{""/m/01jfsb"": ""Thriller"", ""/m/02n4kr"": ""Myster...",0,0,0,0,0.000000e+00,0.000000e+00,4.6
18804,The Cabin in the Woods,2011,65902967,95.0,Thriller,Japanese Language,United States of America,Technicians Gary Sitterson and Steve Hadley p...,tt1259521,7.0,...,"{""/m/03_9r"": ""Japanese Language"", ""/m/02h40lc""...","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/03npn"": ""Horror""...",30000000,0,0,35902967,3.778591e+07,1.771540e+07,4.6


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the scored frame,
# used as a sanity check on the raw columns and on the derived score.
df_avec_successful.describe() 

Unnamed: 0,Release Date,Movie box office revenue,Movie runtime,averageRating,numVotes,Estimated Budget,Oscar Wins,Nominations,Profit,Inflation adjusted profit,capped_profit,Successful
count,35612.0,35612.0,31161.0,35612.0,19876.0,35612.0,35612.0,35612.0,35612.0,35612.0,35612.0,35612.0
mean,1838.949624,10990320.0,134.8571,6.260364,31618.25,3839784.0,0.03642,0.092328,7150540.0,13842000.0,1955330.0,4.177971
std,518.592698,57761410.0,6113.649,0.794563,110707.0,16022930.0,0.353489,0.799779,48186870.0,96046370.0,10999640.0,0.386082
min,0.0,0.0,0.3,1.3,10.0,0.0,0.0,0.0,-225000000.0,-273909600.0,-273909600.0,0.0
25%,1961.0,0.0,88.0,6.260364,926.75,0.0,0.0,0.0,0.0,0.0,0.0,4.1
50%,1991.0,0.0,96.0,6.260364,3067.0,0.0,0.0,0.0,0.0,0.0,0.0,4.1
75%,2005.0,0.0,110.0,6.5,14419.5,0.0,0.0,0.0,0.0,129555.1,129555.1,4.4
max,2014.0,2782275000.0,1079281.0,9.3,2816055.0,380000000.0,11.0,14.0,2545275000.0,6822472000.0,17715400.0,10.0


In [4]:
def get_feature_names(column_transformer):
    """
    Get output feature names from a ColumnTransformer.

    Parameters
    ----------
    column_transformer : object
        Anything exposing ``transformers_`` (the fitted list) or, failing that,
        ``transformers`` (the constructor spec): an iterable of
        ``(name, transformer, columns)`` triples.

    Returns
    -------
    list
        Feature names in transformer order. A transformer that exposes
        ``get_feature_names_out`` (or the legacy ``get_feature_names``) supplies
        its own names; any other transformer contributes its input columns
        unchanged. Entries configured as ``'drop'`` contribute nothing.
    """
    # Prefer the fitted list: it holds the fitted transformers and the
    # 'remainder' entry. Fall back to the constructor spec for objects that only
    # carry `transformers`.
    fitted = getattr(column_transformer, 'transformers_', None)
    transformers = fitted if fitted is not None else column_transformer.transformers

    feature_names = []
    for _name, transformer, original_features in transformers:
        # Dropped columns produce no output, hence no feature names.
        if isinstance(transformer, str) and transformer == 'drop':
            continue

        if hasattr(transformer, 'get_feature_names_out'):
            # Current scikit-learn API.
            names = transformer.get_feature_names_out(original_features)
        elif hasattr(transformer, 'get_feature_names'):
            # Legacy API, kept for transformers that still only offer it.
            names = transformer.get_feature_names(original_features)
        else:
            # 'passthrough' and name-less transformers: columns keep their names.
            names = original_features

        feature_names.extend(names)

    return feature_names



In [5]:
def get_transformer_feature_names(column_transformer):
    """
    Get output feature names from a fitted ColumnTransformer.

    Parameters
    ----------
    column_transformer : object
        Fitted transformer exposing ``transformers_``: an iterable of
        ``(name, transformer, columns)`` triples, including the ``'remainder'``
        entry when there is one.

    Returns
    -------
    list
        Feature names in transformer order. Entries configured as ``'drop'``
        (named or remainder) contribute nothing, ``'passthrough'`` entries
        contribute their columns unchanged, and transformers exposing
        ``get_feature_names_out`` supply their own names.
    """
    output_features = []

    for _name, pipe, features in column_transformer.transformers_:
        if isinstance(pipe, str):
            # Keyword entries: only 'passthrough' emits columns, under the
            # original names. 'drop' emits nothing, so it must not add names.
            if pipe == 'passthrough':
                output_features.extend(features)
            continue

        # Estimator entries, including a remainder handled by an estimator.
        if hasattr(pipe, 'get_feature_names_out'):
            transformer_features = pipe.get_feature_names_out(features)
        else:
            # No naming API available: assume the columns keep their names.
            transformer_features = features

        output_features.extend(transformer_features)

    return output_features

ML: predicting the success score per genre

In [6]:
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb 

# Load and prepare the data
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split


from xgboost import XGBRegressor
#from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV


# `data` is an alias of the scored frame (same object, not a copy).
data = df_avec_successful
#data = data.dropna()  # target



# Full feature matrix / target. The per-genre loop in this cell builds its own
# X_genre / y_genre and does not read X or y.
X = data.drop('Successful', axis=1)
y = data['Successful']  # target


# Preprocessing for both kinds of columns:
#   * categorical: missing values are replaced by the literal 'missing', then
#     the column is one-hot encoded;
#   * numeric: missing values are replaced by the column mean, then the column
#     is standardized.
# The column lists below decide which features each transformer receives;
# extend them to include further features of the dataset.

categorical_columns2 = ['Main language', 'Main country']  # update as needed
numeric_columns = ['Movie runtime']  # update as needed

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Preprocessing for numerical data
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())  # standardizing data
])

# Variant WITH the numeric runtime feature. Defined for experimentation only:
# the per-genre loop in this cell uses `preprocessor` (below), not this one.
preprocessorr = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns2)
    ])

# Variant WITHOUT the numeric runtime feature: categorical columns only.
preprocessor = ColumnTransformer(
    transformers=[
       
        ('cat', categorical_transformer, categorical_columns2)
    ])






# The three most frequent values of 'Main genre' (not every unique genre).
top_genres = data['Main genre'].value_counts().head(3).index

# Hyper-parameter grids. Keys are prefixed with 'classifier__', the name of the
# model step in the pipeline built in the per-genre loop.

# Grid written for a RandomForest model; it is only referenced by a
# commented-out GridSearchCV call in this cell.
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

# Grid for CatBoost: 3 x 3 x 3 = 27 candidate settings.
param_grid_catboost = {
    'classifier__iterations': [100, 500, 1000],
    'classifier__learning_rate': [0.01, 0.1, 0.3],
    'classifier__depth': [4, 6, 10],
    #'classifier__l2_leaf_reg': [1, 3, 5]
}

# Thresholds used by the per-genre analysis below.
SUCCESS_MIN_SCORE = 6.5   # 'Successful' >= this value counts as a successful movie
LOW_SCORE_MAX = 5         # 'Successful' < this value counts as a less successful movie
MIN_GENRE_ROWS = 30       # genres with this many rows or fewer are not modelled
TOP_N_FEATURES = 4        # rows shown in each feature-importance ranking

for genre in top_genres:
    print("------------------------------------------------------------------------------------------")
    print("------------------------------------------------------------------------------------------")

    genre_data = data[data['Main genre'] == genre]
    print(len(genre_data))
    print(f"Analyzing genre: {genre} - Data Points: {len(genre_data)}")

    # Split the genre into successful and less successful movies by score.
    successful_movies = genre_data[genre_data['Successful'] >= SUCCESS_MIN_SCORE]
    less_successful_movies = genre_data[genre_data['Successful'] < LOW_SCORE_MAX]

    # Average runtime of each group. The "optimal" runtime reported below is
    # simply the mean runtime of the successful group.
    # NOTE(review): a mean is sensitive to extreme runtimes - confirm the
    # 'Movie runtime' column has been cleaned of outliers.
    avg_runtime_successful = successful_movies['Movie runtime'].mean()
    avg_runtime_less_successful = less_successful_movies['Movie runtime'].mean()

    if pd.isna(avg_runtime_successful):
        # No successful movie with a known runtime: the mean is NaN, so report
        # that explicitly instead of printing "nan minutes".
        print(f"Optimal Runtime: n/a (no movie with a known runtime has a score >= {SUCCESS_MIN_SCORE})")
    else:
        print(f"Optimal Runtime: {avg_runtime_successful:.2f} minutes")

    if len(genre_data) <= MIN_GENRE_ROWS:
        print(f"Not enough data to analyze genre: {genre}")
        continue

    X_genre = genre_data[categorical_columns2]
    y_genre = genre_data['Successful']

    # Hold out 10% of the genre's rows for evaluation.
    X_train, X_test, y_train, y_test = train_test_split(X_genre, y_genre, test_size=0.1, random_state=42)

    # The step is named 'classifier' (although the model is a regressor)
    # because the hyper-parameter grid keys use the 'classifier__' prefix.
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', CatBoostRegressor(random_state=42, verbose=0))])

    # Hyper-parameter search for the CatBoost model (3-fold CV, scored by MSE).
    grid_search = GridSearchCV(clf, param_grid_catboost, cv=3, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)

    # Best pipeline found by the search, and the names of its encoded features.
    best_clf = grid_search.best_estimator_
    column_transformer = best_clf.named_steps['preprocessor']
    feature_names = get_transformer_feature_names(column_transformer)

    # Evaluate on the held-out rows.
    y_pred = best_clf.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5
    r2 = r2_score(y_test, y_pred)

    print(f"MSE: {mse}, RMSE: {rmse}, R²: {r2}")

    feature_importances = best_clf.named_steps['classifier'].feature_importances_
    print(len(feature_names))
    print(len(feature_importances))

    # Names and importances must line up one-to-one to be paired in a table.
    if len(feature_names) != len(feature_importances):
        print(" feature names and feature importances not same size.")
        continue

    feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
    sorted_feature_importance = feature_importance_df.sort_values(by='Importance', ascending=False)

    # Features coming from the imputer's 'missing' placeholder are excluded
    # from both rankings.
    is_placeholder = sorted_feature_importance['Feature'].str.contains('_missing')

    # Most important language features.
    is_language = sorted_feature_importance['Feature'].str.contains('Main language')
    top_languages = sorted_feature_importance[is_language & ~is_placeholder].head(TOP_N_FEATURES)
    print(f"Top {TOP_N_FEATURES} languages for {genre}:\n{top_languages}")

    # Most important country features.
    is_country = sorted_feature_importance['Feature'].str.contains('Main country')
    top_country = sorted_feature_importance[is_country & ~is_placeholder].head(TOP_N_FEATURES)
    print(f"Top {TOP_N_FEATURES} Country for {genre}:\n{top_country}")
    

------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------
4965
Analyzing genre:  Thriller - Data Points: 4965
Optimal Runtime: 140.57 minutes
MSE: 0.16710044646271255, RMSE: 0.40877921481248597, R²: 0.010223713389151623
154
154
Top 3 languages for  Thriller:
                            Feature  Importance
20  Main language_ English Language   21.440337
24   Main language_ French Language   15.517444
30    Main language_ Hindi Language    9.490942
Top 3 Country for  Thriller:
                                    Feature  Importance
147            Main country_ United Kingdom    8.246067
148  Main country_ United States of America    7.451452
101                    Main country_ France    4.314353
92                     Main country_ Canada    1.574820
------------------------------------------------------------------------------------------
-------------------------