In [None]:
import pandas as pd
import numpy as np
from scipy.stats import zscore

# Load the cleaned movie dataset. If the file is missing we report it and
# re-raise: calling exit() inside a notebook shuts the kernel down, and every
# later cell needs `df_avec_successful` anyway.
try:
    df_avec_successful = pd.read_csv('../Datasets/movies_cleaned_dataset.csv')
except FileNotFoundError as e:
    print(f"File not found: {e}")
    raise

# Fill missing values of the score inputs with the column mean.
# Assigning the result back (instead of `df[col].fillna(..., inplace=True)`)
# avoids chained in-place assignment, which pandas warns about and which
# does not update the parent frame under copy-on-write.
columns_to_fill = ['Inflation_adjusted_profit', 'averageRating', 'Oscar_Wins', 'Nominations']
for column in columns_to_fill:
    df_avec_successful[column] = df_avec_successful[column].fillna(df_avec_successful[column].mean())

# Cap 'Inflation_adjusted_profit' at its 90th percentile to limit the
# influence of outliers: values above the threshold are set to the threshold.
cap_threshold = df_avec_successful['Inflation_adjusted_profit'].quantile(0.90)
df_avec_successful['capped_profit'] = df_avec_successful['Inflation_adjusted_profit'].clip(upper=cap_threshold)

# Weight of each standardized feature in the composite score (weights sum to 1).
weights = {
    'capped_profit': 0.3,
    'averageRating': 0.35,
    'Oscar_Wins': 0.175,
    'Nominations': 0.175
}

# Z-score standardization of exactly the features that carry a weight.
z_features = {feature: zscore(df_avec_successful[feature]) for feature in weights}

# Composite 'Successful' score: weighted sum of the standardized features.
df_avec_successful['Successful'] = sum(weights[feature] * z_features[feature] for feature in weights)

# Min-max rescale the score to 0-10 (one decimal) for interpretability.
min_score = df_avec_successful['Successful'].min()
max_score = df_avec_successful['Successful'].max()
df_avec_successful['Successful'] = (
    (df_avec_successful['Successful'] - min_score) / (max_score - min_score) * 10
).round(1)

# Most successful movies first.
df_avec_successful = df_avec_successful.sort_values(by='Successful', ascending=False)

# Display the top 300 rows
df_avec_successful.head(300)


In [None]:
# Show descriptive statistics of the scored movie table built in the previous cell.
df_avec_successful.describe() 

In [None]:
def get_transformer_feature_names(column_transformer):
    """
    Get feature names from a fitted ColumnTransformer.

    Walks ``column_transformer.transformers_`` (triples of
    ``(name, transformer, columns)``) and collects output names in order.

    Args:
        column_transformer: fitted object exposing ``transformers_``.

    Returns:
        list: output feature names. For each entry:
            * the ``'remainder'`` entry contributes its columns only when it
              is ``'passthrough'``; any other remainder is skipped;
            * an entry whose transformer is ``'drop'`` contributes nothing;
            * a transformer with ``get_feature_names_out`` contributes
              ``get_feature_names_out(columns)``;
            * anything else (e.g. ``'passthrough'``) contributes its input
              column names unchanged.
    """
    output_features = []

    for name, pipe, features in column_transformer.transformers_:
        if name == 'remainder':
            # If the remainder is a passthrough, its feature names are the same as the column names
            if isinstance(pipe, str) and pipe == 'passthrough':
                output_features.extend(features)
            continue

        # Columns routed to 'drop' are removed from the output, so they must
        # not be reported (otherwise names and importances get misaligned).
        if isinstance(pipe, str) and pipe == 'drop':
            continue

        # For transformers with a get_feature_names_out method
        if hasattr(pipe, 'get_feature_names_out'):
            transformer_features = pipe.get_feature_names_out(features)
        else:
            transformer_features = features

        output_features.extend(transformer_features)

    return output_features

ML

In [None]:
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb 
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
#from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

# `data` is an alias of the scored movie table built in the first cell.
data = df_avec_successful
X = data.drop('Successful', axis=1)
y = data['Successful']  # target

# Preprocessing building blocks.
# Categorical features: missing values are replaced by the placeholder
# 'missing', then one-hot encoded (unknown categories are ignored at predict
# time). Numeric features: mean-imputed, then standardized.
# NOTE: only the categorical pipeline is wired into `preprocessor` below;
# `numeric_columns` / `numeric_transformer` are defined but not used by it.
categorical_columns = ['Main_language', 'Main_country']
numeric_columns = ['Movie_runtime']

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Preprocessing for numerical data
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())  # standardizing data
])

# Bundle the preprocessing applied before the regressor (categorical only).
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_columns)
    ])

# Genres ordered by number of movies (most frequent first)
genre_counts = data['Top_genres'].value_counts()
top_genres = genre_counts.index
print(top_genres)
print(genre_counts)

# Hyper-parameter grids. The 'classifier__' prefix targets the pipeline step
# named 'classifier'.

# Grid for a Random Forest regressor
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}
# Grid for CatBoost
param_grid_catboost = {
    'classifier__iterations': [100, 500, 1000],
    'classifier__learning_rate': [0.01, 0.1, 0.3],
    'classifier__depth': [4, 6, 10],
}
# Analysis for each genre
# Plotting libraries used by the per-genre analysis. Imported once, before
# the loop, instead of on every iteration.
import plotly.express as px
import plotly.offline as pyo

# These three are not used by the loop below; they are still imported (best
# effort) because the original cell bound `plt`, `sns` and `mpld3` at module
# level and other cells may rely on those names.
try:
    import matplotlib.pyplot as plt
    import seaborn as sns
    import mpld3
except ImportError as e:
    print(f"Optional plotting dependency not available: {e}")

# How many top features are reported / mapped, and how many languages the
# bar chart shows. Labels below are derived from these so they cannot drift.
N_TOP_FEATURES = 6
N_TOP_LANGUAGES_PLOTTED = 5


def clean_country_name(country_name):
    """Turn a one-hot feature name such as 'Main_country_United States of America'
    into a country label ('United States') by stripping the prefix and ' of America'."""
    cleaned_name = country_name.replace("Main_country_", "").replace(" of America", "")
    return cleaned_name


# Analysis for each genre: fit a CatBoost regressor on language/country,
# report the most important languages and countries, and save two figures.
for genre in top_genres:
    print("------------------------------------------------------------------------------------------")
    print("------------------------------------------------------------------------------------------")

    # .copy() gives an independent frame, so the relabelling below is a plain
    # assignment and not a write to a slice of `data`.
    genre_data = data[data['Top_genres'] == genre].copy()
    print(f"Analyzing genre: {genre} - Data Points: {len(genre_data)}")

    genre_data.loc[genre_data['Main_language'] == 'Multilingual', 'Main_language'] = 'English Language'

    # Movies with a success score of at least 6
    successful_movies = genre_data[genre_data['Successful'] >= 6].copy()
    successful_movies['Movie_runtime'] = pd.to_numeric(successful_movies['Movie_runtime'], errors='coerce')

    # Series.mean() skips NaN, so non-numeric runtimes are ignored
    mean_runtime = successful_movies['Movie_runtime'].mean()
    print(f"Optimal Runtime: {mean_runtime:.2f} minutes")

    # File-name suffix: 'Action/Adventure' keeps its historical name; any other
    # genre has '/' replaced so it cannot be read as a directory separator.
    if genre == 'Action/Adventure':
        genre_tag = 'actionadventure'
    else:
        genre_tag = '_' + str(genre).replace('/', '_')

    try:
        X_genre = genre_data[categorical_columns]
        y_genre = genre_data['Successful']
        X_train, X_test, y_train, y_test = train_test_split(X_genre, y_genre, test_size=0.1, random_state=42)

        clf = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', CatBoostRegressor(random_state=42, verbose=0))])

        # Grid search for hyperparameter tuning
        grid_search = GridSearchCV(clf, param_grid_catboost, cv=3, scoring='neg_mean_squared_error')
        grid_search.fit(X_train, y_train)

        # Best model after grid search
        best_clf = grid_search.best_estimator_

        # Predict and evaluate on the held-out 10 %
        y_pred = best_clf.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        rmse = mse ** 0.5
        r2 = r2_score(y_test, y_pred)

        print(f"MSE: {mse}, RMSE: {rmse}, R²: {r2}")

        # Feature Importance Analysis
        feature_importances = best_clf.named_steps['classifier'].feature_importances_
        column_transformer = best_clf.named_steps['preprocessor']
        feature_names = get_transformer_feature_names(column_transformer)

        # Without aligned names there is nothing meaningful to rank or plot.
        # Skip the genre instead of falling through and plotting the tables
        # left over from the previous genre.
        if len(feature_names) != len(feature_importances):
            print("Number of feature names and feature importances do not match.")
            continue

        feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
        sorted_feature_importance = feature_importance_df.sort_values(by='Importance', ascending=False)
        feature_col = sorted_feature_importance['Feature']

        # Top languages / countries, excluding the imputer's 'missing' placeholder
        is_language = feature_col.str.contains('Main_language_', regex=False)
        is_missing_language = feature_col.str.contains('Main_language_missing', regex=False)
        top_languages = sorted_feature_importance[is_language & ~is_missing_language].head(N_TOP_FEATURES).copy()

        is_country = feature_col.str.contains('Main_country_', regex=False)
        is_missing = feature_col.str.contains('missing', regex=False)
        top_countries = sorted_feature_importance[is_country & ~is_missing].head(N_TOP_FEATURES)

        print(f"Top {N_TOP_FEATURES} Languages for {genre}:\n{top_languages}")
        print(f"Top {N_TOP_FEATURES} Countries for {genre}:\n{top_countries}")

        # ---- World map of the top countries ----
        cleaned_countries = [clean_country_name(country) for country in top_countries['Feature']]

        # One distinct colour per country, from Plotly's qualitative palette
        colors = px.colors.qualitative.Plotly
        assert len(colors) >= len(cleaned_countries), "Not enough colors available."
        color_mapping = dict(zip(cleaned_countries, colors))

        # The country name serves both as map location and as colour key
        country_map_df = pd.DataFrame({
            'color': cleaned_countries,
            'country': cleaned_countries
        })

        fig = px.choropleth(country_map_df,
                            locations='color',
                            locationmode='country names',
                            color='country',
                            hover_name='color',
                            color_discrete_map=color_mapping,
                            title='Top 6 recommended release country')

        fig.show()
        # Save the figure to an HTML file
        pyo.plot(fig, filename=f'catboost_info/map{genre_tag}.html', auto_open=False)

        # ---- Bar chart of the top languages ----
        # Strip the one-hot prefix and the word 'Language' from the labels
        top_languages['Feature'] = top_languages['Feature'].str.replace('Main_language_', '', regex=False)
        top_languages['Feature'] = top_languages['Feature'].str.replace('Language', '', regex=False)

        fig2 = px.bar(top_languages.head(N_TOP_LANGUAGES_PLOTTED),
                      x='Importance',
                      y='Feature',
                      title=f'Top {N_TOP_LANGUAGES_PLOTTED} Important Language',
                      labels={'Importance': 'Language Importance', 'Feature': 'Languages'},
                      orientation='h',  # Horizontal bar plot
                      color='Importance',  # Color the bars by the 'Importance' value
                      color_continuous_scale='viridis')  # Use the 'viridis' color scale

        # Update layout for better readability
        fig2.update_layout(
            xaxis_title='Language Importance',
            yaxis_title='Languages',
            title_font_size=15,
            xaxis_tickangle=45
        )

        fig2.show()
        pyo.plot(fig2, filename=f'catboost_info/maplangue{genre_tag}.html', auto_open=False)

    except Exception as e:
        # Best effort: a failure for one genre must not stop the others
        print(f"An error occurred while analyzing genre {genre}: {e}")

Cast: find the best profile for each of the top 4 roles (sex, number of movies played in, age).
In the code below we train an XGBoost (boosted trees) model to predict the success score from these features.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
import numpy as np
import random

# Turn off pandas' chained-assignment warning for the rest of the session.
pd.options.mode.chained_assignment = None  # default='warn'

def load_data(filepath):
    """
    Read a CSV file into a DataFrame.

    Returns the DataFrame, or None (after printing a message) when the file
    does not exist.
    """
    try:
        return pd.read_csv(filepath)
    except FileNotFoundError:
        print(f"File not found: {filepath}")
    return None

def preprocess_data(df,genre):
    """
    Build the per-genre modelling table for the cast analysis.

    Steps: drop rows with any missing value, keep only rows whose
    'Top_genres' equals `genre`, parse 'Release_Date' as a year, derive each
    role's age at release, rescale each role's roles count, and integer-encode
    each role's sex.

    Args:
        df: DataFrame with 'Top_genres', 'Release_Date' and, for each of
            role_1..role_4, '<role>_birth_year', '<role>_roles_count' and
            '<role>_sex'. The frame is NOT modified.
        genre: value of 'Top_genres' to keep.

    Returns:
        A new DataFrame with the added '<role>_age' columns and the
        transformed '<role>_roles_count' / '<role>_sex' columns.
    """
    roles = ['role_1', 'role_2', 'role_3', 'role_4']

    # Work on a private copy: the caller reuses `df` for every genre.
    df = df.dropna()
    df = df[df['Top_genres'] == genre].copy()

    #df = df[df['Successful'] >= 7.5]

    df['Release_Date'] = pd.to_datetime(df['Release_Date'], format='%Y')

    # Ensure that birth year columns are numeric; unparsable values become NaN
    for role in roles:
        birth_year_column = f'{role}_birth_year'
        df[birth_year_column] = pd.to_numeric(df[birth_year_column], errors='coerce')

    # Age of each actor at release = release year - birth year
    release_year = df['Release_Date'].dt.year
    for role in roles:
        df[f'{role}_age'] = release_year - df[f'{role}_birth_year']

    # Rescale each role's count using THAT role's own count and age.
    # (Previously roles 2-4 reused the already-transformed role_1 count.)
    # NOTE(review): the formula is kept as written, (count * age - 30) / 60;
    # confirm that count * (age - 30) / 60 was not what was meant.
    for role in roles:
        count_column = f'{role}_roles_count'
        df[count_column] = (df[count_column] * df[f'{role}_age'] - 30) / 60

    # Integer-encode sex per column: sorted distinct labels -> 0..n-1
    # (the same codes a per-column LabelEncoder would produce).
    for role in roles:
        sex_column = f'{role}_sex'
        codes, _ = pd.factorize(df[sex_column].astype(str), sort=True)
        df[sex_column] = codes

    return df


def feature_selection(df):
    """
    Split the preprocessed cast table into model features and target.

    Features are, in this order, the four role ages, the four role counts and
    the four role sexes. Rows with a missing feature are dropped, and the
    'Successful' target is restricted to the surviving index.

    Returns:
        (features, target)
    """
    role_numbers = (1, 2, 3, 4)
    feature_columns = [
        f'role_{number}_{suffix}'
        for suffix in ('age', 'roles_count', 'sex')
        for number in role_numbers
    ]

    # Keep only complete rows, then align the target on the same index
    features = df[feature_columns].dropna()
    target = df['Successful'].loc[features.index]

    return features, target

def train_model(X_train, y_train, param_grid=None):
    """
    Tune an XGBoost regressor with a 3-fold grid search and return the best one.

    Args:
        X_train: training features (the caller passes the scaled matrix).
        y_train: training target.
        param_grid: optional GridSearchCV parameter grid. Defaults to the
            compact 5-parameter grid below.

    Returns:
        The best estimator found (selected by negative mean squared error).
    """
    if param_grid is None:
        param_grid = {
            'n_estimators': [100, 200, 300],
            'learning_rate': [0.01, 0.05, 0.1],
            'max_depth': [3, 4, 5],
            'min_child_weight': [1, 2, 3],
            'subsample': [0.7, 0.8, 0.9]
        }

    regressor = XGBRegressor(random_state=42)
    grid_search = GridSearchCV(estimator=regressor, param_grid=param_grid,
                               cv=3, n_jobs=-1, verbose=0,
                               scoring='neg_mean_squared_error')
    # Fit on the data that was passed in, not on a global of a similar name
    grid_search.fit(X_train, y_train)

    return grid_search.best_estimator_

def evaluate_model(model, X_test, y_test):
    """
    Score `model` on the held-out data.

    Returns:
        (mse, r2): mean squared error and R² of the predictions for X_test
        against y_test.
    """
    predictions = model.predict(X_test)
    return mean_squared_error(y_test, predictions), r2_score(y_test, predictions)

def feature_importance(model, features):
    """
    Tabulate the model's feature importances, largest first.

    Pairs `features.columns` with `model.feature_importances_` in a DataFrame
    with columns 'Feature' and 'Importance', sorted by descending importance.
    """
    table = pd.DataFrame({
        'Feature': features.columns,
        'Importance': model.feature_importances_,
    })
    return table.sort_values(by='Importance', ascending=False)

########################################################################################################################
# Purpose of the random search algorithm below: find the combination of features that maximizes the success rate, based on the importance of the parameters found above

def convert_profile_to_model_input(profile):
    """
    Flatten a profile dictionary into the feature vector the model was trained on.

    The training features are ordered by kind, not by role: all role ages,
    then all role counts, then all role sexes (see `feature_selection`). The
    vector is built in that same order, with roles taken in the dictionary's
    order.

    Args:
        profile: {role: {'age': ..., 'roles_count': ..., 'sex': 'M' or other}}

    Returns:
        list: [age_1..age_n, roles_count_1..roles_count_n, sex_1..sex_n],
        where sex is 1 for 'M' and 0 otherwise.
        # NOTE(review): assumes the training data encodes 'M' as 1 — confirm.
    """
    role_features = list(profile.values())
    ages = [features['age'] for features in role_features]
    roles_counts = [features['roles_count'] for features in role_features]
    sexes = [1 if features['sex'] == 'M' else 0 for features in role_features]
    return ages + roles_counts + sexes

def optimize_profiles(model, initial_guess, iterations=1000,
                      age_choices=range(15, 80),
                      roles_count_choices=range(1, 300),
                      sex_choices=('M', 'F')):
    """
    Optimizes actor profiles to achieve the best success rate predicted by the model.

    Random hill climbing: each iteration copies the best profile so far,
    re-draws one randomly chosen feature of one randomly chosen role, and
    keeps the proposal only if the model scores it strictly higher.

    Args:
    model: Trained machine learning model used for prediction.
    initial_guess: Initial guess of the profiles. It is not modified.
    iterations: Number of iterations for the optimization process.
    age_choices: Values an 'age' may be re-drawn from.
    roles_count_choices: Values a 'roles_count' may be re-drawn from.
    sex_choices: Values a 'sex' may be re-drawn from.

    Returns:
    Tuple of the best profile and its corresponding success rate.
    """
    def _clone(profile):
        # Copy the per-role dictionaries too: a shallow dict.copy() would
        # share them, so editing a proposal would also edit the best profile
        # (and the caller's initial guess).
        return {role: dict(features) for role, features in profile.items()}

    best_profile = _clone(initial_guess)
    best_score = -np.inf

    for _ in range(iterations):
        current_profile = _clone(best_profile)

        # Randomly adjust one parameter in one of the roles
        role_to_adjust = random.choice(list(current_profile.keys()))
        feature_to_adjust = random.choice(list(current_profile[role_to_adjust].keys()))

        if feature_to_adjust == 'age':
            current_profile[role_to_adjust]['age'] = random.choice(age_choices)
        elif feature_to_adjust == 'roles_count':
            current_profile[role_to_adjust]['roles_count'] = random.choice(roles_count_choices)
        elif feature_to_adjust == 'sex':
            current_profile[role_to_adjust]['sex'] = random.choice(sex_choices)

        # Convert the current profile for model input and predict success rate
        model_input = convert_profile_to_model_input(current_profile)
        success_rate = model.predict([model_input])[0]

        # Update best profile if current is better
        if success_rate > best_score:
            best_score = success_rate
            best_profile = current_profile

    return best_profile, best_score

########################################################################################################################
def similarity_score(actor_data, target_profile):
    """
    Distance between an actor record and a target profile (lower is closer).

    Half the absolute age gap plus half the absolute roles-count gap, plus a
    penalty of 1000 when actor_data['category'] differs from
    target_profile['sex'].
    """
    age_gap = abs(actor_data['age'] - target_profile['age'])
    roles_gap = abs(actor_data['total_roles'] - target_profile['roles_count'])
    if actor_data['category'] == target_profile['sex']:
        mismatch_penalty = 0
    else:
        mismatch_penalty = 1000
    return age_gap * 0.5 + roles_gap * 0.5 + mismatch_penalty


def find_best_matches(df, optimized_profile):
    """
    Finds the top 2 best matching actors for the given optimized profile.

    For every role, each row of `df` is scored with `similarity_score`
    against that role's target profile and the two lowest-scoring rows are
    kept.

    Side effect: `df` gains (or has overwritten) a 'similarity_score' column;
    after the call it holds the scores for the last role processed.

    Args:
    df: DataFrame containing actor information.
    optimized_profile: Dictionary of the optimized profiles.

    Returns:
    Dictionary of top 2 best matching actors for each role.
    """
    matches_by_role = {}
    for role in optimized_profile:
        wanted = optimized_profile[role]

        # Score every actor against this role's target profile
        df['similarity_score'] = df.apply(lambda actor: similarity_score(actor, wanted), axis=1)

        # Lowest score = closest match; keep the names of the best two
        closest_two = df.sort_values(by='similarity_score').head(2)
        matches_by_role[role] = closest_two['name'].tolist()

    return matches_by_role


def get_actor_names(nconst_lists, cast_df):
    """
    Retrieves actor names corresponding to the nconst values for each role.
    Candidates are tried in order: if the primary choice (first nconst) is not
    present in `cast_df`, the secondary choice is tried, and so on.

    Args:
    nconst_lists: Dictionary mapping each role to a list of nconst values
        (a single nconst string is also accepted).
    cast_df: DataFrame containing cast names ('primaryName') and 'nconst'.

    Returns:
    Dictionary of actor names for each role. Roles for which no candidate is
    found in `cast_df` are omitted.
    """
    primary_names = {}
    for role, candidates in nconst_lists.items():
        # Accept a bare identifier as a one-element candidate list
        if isinstance(candidates, str):
            candidates = [candidates]

        for actor_id in candidates:
            # Exact match on the identifier (not a substring/regex search)
            names = cast_df.loc[cast_df['nconst'] == actor_id, 'primaryName']
            if not names.empty:
                primary_names[role] = names.iloc[0]
                break

    return primary_names


# Main execution: for each genre, fit the cast model, search for the cast
# profile with the highest predicted success, match real actors to it, and
# save a feature-importance chart.
if __name__ == "__main__":
    # This cell's import list does not bring in Plotly, so import it here
    # rather than relying on names left behind by an earlier cell.
    import plotly.express as px
    import plotly.offline as pyo

    filepath = '../Datasets/cast_processed_corrected_genres.csv'
    df = load_data(filepath)
    Result = {}  # genre -> metrics, optimized profile and matched actors

    if df is not None:
        Top_Genre =['Drama','Other','Family', 'Action/Adventure', 'Horror', 'Fiction']
        for genre in Top_Genre:
            print(f"Analyzing genre: {genre} - Data Points: {len(df[df['Top_genres'] == genre])}")

            ####################################################################################################################
            ############################# XGB to understand importance of each features ########################################
            ####################################################################################################################

            df_processed = preprocess_data(df,genre)
            features, target = feature_selection(df_processed)

            # Splitting the dataset
            X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
            X_train = X_train.replace([np.inf, -np.inf], np.nan)
            X_test = X_test.replace([np.inf, -np.inf], np.nan)

            # Fill NaN values with column means. The means come from the
            # training split only and are reused for the test split, so no
            # test-set statistics are used during preparation.
            train_means = X_train.mean()
            X_train = X_train.fillna(train_means)
            X_test = X_test.fillna(train_means)

            # Feature scaling (fitted on the training split only)
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            # Model training
            best_model = train_model(X_train_scaled, y_train)

            # Model evaluation
            mse_best, r2_best = evaluate_model(best_model, X_test_scaled, y_test)
            print(f"Optimized XGBoost MSE: {mse_best}, R²: {r2_best}")

            # Feature importance
            importance_df = feature_importance(best_model, features)

            # Importance table of each role's features, most important first
            roles = ['role_1', 'role_2', 'role_3', 'role_4']
            profiles = {
                role: importance_df[importance_df['Feature'].str.startswith(role)]
                      .sort_values(by='Importance', ascending=False)
                for role in roles
            }

            for role, profile in profiles.items():
                print(f"{role} Profile:")
                print(profile)
                print()

            ####################################################################################################################
            ####################### find best combination of feature that maximisze success rate ###############################
            ####################################################################################################################

            # Random search for the combination of features that maximizes
            # the predicted success rate.
            # NOTE(review): `best_model` was fitted on standardized features
            # (and on transformed roles counts), while the search feeds it raw
            # ages / counts — confirm whether the inputs should go through
            # `scaler` first.

            # `sex_range` is read as a global by optimize_profiles.
            # NOTE(review): `age_range` and `role_count_range` are not
            # consumed anywhere in this cell.
            age_range = range(10, 90)
            role_count_range = range(1, 150)
            sex_range = ['M', 'F']  # Male and Female

            initial_guess = {
                'role_1': {'age': 0, 'roles_count': 0, 'sex': 'M'},
                'role_2': {'age': 0, 'roles_count': 0, 'sex': 'M'},
                'role_3': {'age': 0, 'roles_count': 0, 'sex': 'M'},
                'role_4': {'age': 0, 'roles_count': 0, 'sex': 'M'}
            }

            optimized_profile, optimized_score = optimize_profiles(best_model, initial_guess, iterations=50000)
            print("Optimized Profile:", optimized_profile)

            ####################################################################################################################
            ##################### Looking for the best match in the database in order to find Actors ###########################
            ####################################################################################################################

            similar_actors = None  # stays None when the actor list cannot be read
            try:
                cast_name_df = pd.read_csv('../Datasets/top1000_actors.csv')
                similar_actors = find_best_matches(cast_name_df, optimized_profile)
                print("Best Actors:", similar_actors)

            except FileNotFoundError as e:
                print(f"File not found: {e}")

            # Keep a per-genre summary so the final print has something to show
            Result[genre] = {
                'mse': mse_best,
                'r2': r2_best,
                'optimized_profile': optimized_profile,
                'optimized_score': optimized_score,
                'best_actors': similar_actors,
            }

            # Plotting Feature Importance
            top_features = importance_df.head(16)

            # Creating the bar plot with Plotly Express
            fig3 = px.bar(top_features,
              x='Importance',
              y='Feature',
              title=f'Important Features for Movie Success in {genre} : Actors',
              labels={'Importance': 'Feature Importance', 'Feature': 'Features'},
              orientation='h',  # Horizontal bar plot
              color='Importance',  # Color the bars by the 'Importance' value
              color_continuous_scale='viridis')  # Color scale for the bars

            # Displaying the plot
            fig3.show()

            # 'Action/Adventure' keeps its historical file name; '/' in any
            # other genre is replaced so it cannot act as a path separator.
            if genre == 'Action/Adventure':
                genre_tag = 'actionadventure'
            else:
                genre_tag = str(genre).replace('/', '_')
            pyo.plot(fig3, filename=f'catboost_info/featureimportance_{genre_tag}.html', auto_open=False)

        print(Result)

Analysis

Director: find the best profile based on the number of movies and the director's age

Analysis 

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings("ignore")

def read_data(file_path):
    """Read a CSV into a DataFrame; print the error and return None if the file is missing."""
    try:
        frame = pd.read_csv(file_path)
    except FileNotFoundError as e:
        print(f"File not found: {e}")
        frame = None
    return frame

def preprocess_data(df, genre):
    """
    Select the director rows of one genre that are usable for modelling.

    Keeps only the six columns needed downstream, drops rows with a missing
    value in any of them, and discards directors younger than 20.
    """
    kept_columns = ['num_movies', 'Director_age', 'Successful', 'death_year', 'Director_name','Top_genres']
    in_genre = df['Top_genres'] == genre
    complete_rows = df.loc[in_genre, kept_columns].dropna()
    # realistic age range
    return complete_rows[complete_rows['Director_age'] >= 20]

def train_model(X_train, y_train):
    """Fit an XGBoost regressor (default hyper-parameters, seed 42) and return it."""
    regressor = XGBRegressor(random_state=42)
    regressor.fit(X_train, y_train)
    return regressor

def evaluate_model(model, X_test, y_test):
    """
    Score `model` on the held-out data.

    The metrics used to be computed and then thrown away; they are now
    returned so callers can inspect or log them. Callers that ignore the
    return value are unaffected.

    Returns:
        (mse, r2): mean squared error and R² of the predictions for X_test
        against y_test.
    """
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return mse, r2

def find_optimal_combination(model, movies_range=range(1, 101), age_range=range(20, 81)):
    """
    Grid-search the (number of movies, director age) pair with the highest
    predicted success.

    All combinations are scored in a single batched ``model.predict`` call
    instead of one call per pair. Ties go to the first pair in iteration
    order (movies outer, age inner).

    Args:
        model: fitted regressor whose ``predict`` accepts an (n, 2) array of
            [num_movies, director_age] rows.
        movies_range: candidate movie counts (default 1..100).
        age_range: candidate ages (default 20..80).

    Returns:
        tuple: (movies, age) of the best pair, or (0, 0) when there is no
        candidate or no prediction is greater than -inf.
    """
    movie_values = list(movies_range)
    age_values = list(age_range)
    if not movie_values or not age_values:
        return (0, 0)

    # Row i * len(age_values) + j holds (movie_values[i], age_values[j])
    grid = np.array([[movies, age] for movies in movie_values for age in age_values])
    scores = np.asarray(model.predict(grid), dtype=float)

    # NaN never beats the running best in a `>` comparison; mirror that here
    scores = np.where(np.isnan(scores), -np.inf, scores)

    best_index = int(np.argmax(scores))  # first occurrence of the maximum
    if scores[best_index] == -np.inf:
        return (0, 0)

    return (movie_values[best_index // len(age_values)],
            age_values[best_index % len(age_values)])
# Load the data
movies_df = read_data('../Datasets/movies_directors_with_genre.csv')
if movies_df is None:
    # FileNotFoundError is more specific than a bare Exception and is still
    # caught by any `except Exception` handler.
    raise FileNotFoundError("Data file not found. Please check the file path.")

# Define genres to analyze
genres = ['Drama', 'Family', 'Action/Adventure', 'Horror', 'Fiction']

# Per-genre results, filled by the loop below
optimal_directors_by_genre = {}     # genre -> up to two director names
feature_importances_by_genre = {}   # genre -> importance table
optimal_combinations = {}           # genre -> (num_movies, Director_age)
processed_data_by_genre = {}        # genre -> preprocessed frame (for plots)

for genre in genres:
    df_genre = preprocess_data(movies_df, genre)
    processed_data_by_genre[genre] = df_genre  # Store the processed data for visualization
    X = df_genre[['num_movies', 'Director_age']]
    y = df_genre['Successful']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    best_model = train_model(X_train, y_train)
    evaluate_model(best_model, X_test, y_test)

    best_combination = find_optimal_combination(best_model)
    optimal_combinations[genre] = best_combination

    # Store feature importances
    feature_importances = best_model.feature_importances_
    importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
    feature_importances_by_genre[genre] = importance_df

    # Living directors (death_year == 0) whose profile is closest to the
    # optimal combination. .copy() so the new column is written to an
    # independent frame, not to a slice of the stored one.
    df_genre = df_genre[df_genre['death_year'] == 0].copy()

    # similarity = 1 / Euclidean distance to the optimum
    # (an exact match has distance 0 and therefore similarity inf)
    distance = np.sqrt((df_genre['num_movies'] - best_combination[0]) ** 2
                       + (df_genre['Director_age'] - best_combination[1]) ** 2)
    df_genre['similarity'] = 1 / distance

    # A director can appear on several rows. De-duplicate the ranked names and
    # keep the first two distinct ones: this replaces the hard-coded iloc[3]
    # workaround for Drama/Family and cannot raise when fewer rows exist.
    ranked_names = df_genre.sort_values(by='similarity', ascending=False)['Director_name'].drop_duplicates()
    best_director = ranked_names.head(2).tolist()

    optimal_directors_by_genre[genre] = best_director

print("Best Directors for Each Genre:", optimal_directors_by_genre)

# Visualization
# plotly.offline is not part of this cell's import list; import it here so
# the cell does not depend on a name left behind by an earlier cell.
import plotly.offline as pyo


def _genre_file_tag(genre):
    """File-name suffix for a genre: 'Action/Adventure' keeps its historical
    'actionadventure'; '/' in any other genre becomes '_'."""
    if genre == 'Action/Adventure':
        return 'actionadventure'
    return str(genre).replace('/', '_')


# Scatter plots: success against number of movies and against director age
for genre, df_genre in processed_data_by_genre.items():

    fig = px.scatter(df_genre, x='num_movies', y='Successful', title=f'Relationship Between Number of Movies and Success in {genre}')
    fig.show()

    fig2 = px.scatter(df_genre, x='Director_age', y='Successful', title=f'Relationship Between Director Age and Success in {genre}')
    fig2.show()

    genre_tag = _genre_file_tag(genre)
    pyo.plot(fig, filename=f'catboost_info/relationnummovie_{genre_tag}.html', auto_open=False)
    pyo.plot(fig2, filename=f'catboost_info/relationdirectorage_{genre_tag}.html', auto_open=False)


# Feature Importance Plot
for genre, importance_df in feature_importances_by_genre.items():
    fig = px.bar(importance_df,
                 x='Importance',
                 y='Feature',
                 title=f'Important Features for Movie Success in {genre}',
                 labels={'Importance': 'Feature Importance', 'Feature': 'Features'},
                 orientation='h',
                 color='Importance',
                 color_continuous_scale='viridis')
    fig.show()
    # The 'actorimportance_' prefix is kept as-is so existing file names do not change
    pyo.plot(fig, filename=f'catboost_info/actorimportance_{_genre_file_tag(genre)}.html', auto_open=False)

# Optimal Combinations Plot: one point per genre
optimal_data = [
    {'Genre': genre, 'num_movies': comb[0], 'Director_age': comb[1]}
    for genre, comb in optimal_combinations.items()
]

optimal_df = pd.DataFrame(optimal_data)
fig2 = px.scatter(optimal_df, x='num_movies', y='Director_age', color='Genre',
                  title='Optimal Number of Movies and Director Age for Each Genre',
                  labels={'num_movies': 'Number of Movies', 'Director_age': 'Director Age'})
fig2.show()
