In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from tqdm import tqdm
tqdm.pandas()
import json
from collections import defaultdict
#from transformers import pipeline
import ast
import re
import plotly.express as px
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from statsmodels import tools
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import seaborn as sns

import folium
import geopandas as gpd
from folium import Choropleth, CircleMarker, Popup

import joblib

# not displaying warnings
pd.options.mode.chained_assignment = None 

# to facilitate the use of py files
%load_ext autoreload
%autoreload 2



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
from src.data.data_loader import *
from src.data.data_transformer import *
from src.utils.visualization import *
from src.utils.methods import *

In [4]:
movies_complete_df = preprocess_movies_complete(from_files=True) # to do once to generate the csv file

In [None]:
def preprocess_before_inference(df): 
    """
    Preprocess the data for use in our ML model 

    Args: 
        df (DataFrame): Dataframe on which we extract the data for the preprocessing
    Returns:
        df_preprocessed (DataFrame): preprocessed dataframe

    """
    df_without_emotions = df.copy(deep = True)
    df_full_emotions = df.copy(deep = True)

    df_without_emotions = df_without_emotions[df_without_emotions['bechdel_rating'].isna()]

    genres_list = df_without_emotions.explode('movie_genres')['movie_genres'].unique().tolist()
    countries_list = df_without_emotions.explode("movie_countries")["movie_countries"].unique().tolist()

    cols_df_genres_countries = pd.DataFrame(columns= genres_list + countries_list)
    df_without_emotions = pd.concat([df_without_emotions, cols_df_genres_countries], axis=1).fillna(0).reset_index(drop=True)

    for index, row in df_without_emotions.iterrows():
        genres = row["movie_genres"]
        countries = row["movie_countries"]
        for genre in genres:
            df_without_emotions.at[index, genre] = 1
        for country in countries:
            df_without_emotions.at[index, country] = 1
        
    # dropping old unformatted columns
    df_without_emotions = df_without_emotions.drop(columns=["actor_genders", "movie_genres", "movie_countries", "actor_genders", "emotion_scores", "dominant_emotion", "wikipedia_movie_id", "movie_name", "director_name", "actor_age"])
    df_without_emotions.columns = df_without_emotions.columns.astype(str)

    df_without_emotions["director_gender"] = df_without_emotions["director_gender"].apply(lambda x: int(0) if (x=='M') else int(1))


    
    df_full_emotions = df_full_emotions[df_full_emotions['bechdel_rating'].isna()]
    df_full_emotions = df_full_emotions.dropna(subset=['emotion_scores'])

    df_full_emotions["emotion_scores"] = df_full_emotions["emotion_scores"].str.replace("'", '"')
    # Parse the corrected strings into dictionaries
    df_full_emotions["emotion_scores"] = df_full_emotions["emotion_scores"].apply(json.loads)

    emotion_list = df_full_emotions["dominant_emotion"].unique().tolist()
    # add genre and countries columns
    cols_df = pd.DataFrame(columns= emotion_list) # genres_list + countries_list + 
    df_full_emotions = pd.concat([df_full_emotions, cols_df], axis=1).fillna(0).reset_index(drop=True)


    for index, row in df_full_emotions.iterrows():
        emotions_dict = row["emotion_scores"]
        for emotion in emotion_list:
            df_full_emotions.at[index, emotion] = emotions_dict[emotion]


    # dropping old unformatted columns
    df_full_emotions = df_full_emotions.drop(columns=["actor_genders", "movie_genres", "movie_countries", "actor_genders", "emotion_scores", "dominant_emotion", "wikipedia_movie_id", "movie_name", "director_name", "actor_age"])
    df_full_emotions.columns = df_full_emotions.columns.astype(str)

    # simplifying the bechdel_rating column into 0 (M) and 1(F)
    df_full_emotions["director_gender"] = df_full_emotions["director_gender"].apply(lambda x: int(0) if (x=='M') else int(1))

    complicated_columns = ['movie_budget', 'bechdel_rating', 'char_M', 'movie_release_date',
       'num_votes', 'char_F', 'director_gender', 'box_office_revenue',
       'average_rating', 'char_tot']
    
    df_full_emotions = df_full_emotions.drop(columns=complicated_columns)

    df_preprocessed = pd.concat([df_without_emotions, df_full_emotions], axis=1, join="outer")


    df_bechdel = obtain_df_bechdel_used_in_ML(movies_complete_df)
    column_bechdel = list((set(df_bechdel.columns)))
    df_preprocessed['Qatar'] = 0
    df_preprocessed = df_preprocessed[column_bechdel]
    # reorder columns
    df_preprocessed = df_preprocessed[df_bechdel.columns]
    # we don't need the bechdel_rating column for the inference
    df_preprocessed = df_preprocessed.drop(columns=['bechdel_rating'])

    
    return df_preprocessed

In [None]:
df_preprocessed = preprocess_before_inference(movies_complete_df)

In [7]:
df_preprocessed

Unnamed: 0,movie_release_date,director_gender,box_office_revenue,movie_budget,average_rating,num_votes,char_M,char_F,char_tot,Thriller,...,Palestine,United Arab Emirates,Puerto Rico,fear,neutral,sadness,anger,disgust,surprise,joy
0,2000,1,0.0,0.0,0.0,0.0,11,4,15,0,...,0,0,0,0.140068,0.069125,0.236309,0.331408,0.200978,0.015162,0.006950
1,1988,1,0.0,0.0,0.0,0.0,2,2,4,0,...,0,0,0,0.005076,0.179360,0.129934,0.284579,0.379688,0.009222,0.012142
2,1987,0,0.0,0.0,0.0,0.0,1,1,2,1,...,0,0,0,0.317639,0.044655,0.559689,0.021347,0.021774,0.006708,0.028188
3,1983,0,0.0,0.0,5.9,648.0,2,1,3,0,...,0,0,0,0.096375,0.373988,0.081687,0.244675,0.186290,0.005764,0.011221
4,2002,1,0.0,0.0,0.0,0.0,7,1,8,0,...,0,0,0,0.012456,0.169905,0.352295,0.008945,0.011762,0.032214,0.412423
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56384,1941,0,0.0,0.0,6.1,1711.0,9,4,13,1,...,0,0,0,,,,,,,
56385,2007,1,0.0,0.0,0.0,0.0,9,1,10,0,...,0,0,0,,,,,,,
56386,1972,0,0.0,0.0,5.9,117.0,2,0,2,0,...,0,0,0,,,,,,,
56387,1992,1,0.0,0.0,0.0,0.0,10,5,15,0,...,0,0,0,,,,,,,


In [None]:
def obtain_prediction_bechdel(df_preprocessed):
    """
        Obtain the bechdel prediction for the movies that we don't have the bechdel rating for

        Args: 
            df_preprocessed (DataFrame): Dataframe on which we extract the data for the prediction
        Returns:
            df_bechdel_predictions (DataFrame): prediction dataframe with only the bechdel rating column

    """

    log_reg_model = joblib.load('log_reg_model.pkl')
    scaler = joblib.load('scaler.pkl')

    standardized_preprocessed = scaler.transform(df_preprocessed)
    standardized_preprocessed = np.nan_to_num(standardized_preprocessed)

    predictions_bechdel = log_reg_model.predict(standardized_preprocessed)

    df_bechdel_predictions = pd.DataFrame(predictions_bechdel, columns=['bechdel_rating'])

    return df_bechdel_predictions

In [20]:
predictions_bechdel = obtain_prediction_bechdel(df_preprocessed)
predictions_bechdel

array([0, 0, 0, ..., 0, 0, 1])

In [25]:
df_bechdel_predictions = obtain_prediction_bechdel(df_preprocessed)
df_bechdel_predictions

Unnamed: 0,bechdel_rating
0,0
1,0
2,0
3,1
4,1
...,...
56384,0
56385,0
56386,0
56387,0


In [29]:
def plot_bechdel_predictions(df_bechdel_predictions):
    """
    Plot the distribution of the bechdel predictions

    Args: 
        df_bechdel_predictions (DataFrame): Dataframe with the bechdel predictions
    Returns:
        None
    """

    value_counts = df_bechdel_predictions['bechdel_rating'].value_counts().reset_index()
    value_counts.columns = ['bechdel_rating', 'count']

    fig = px.bar(
        value_counts,
        x='bechdel_rating',
        y='count',
        text='count',
        labels={'bechdel_rating': 'Bechdel Rating', 'count': 'Count'},
        title='Distribution of Bechdel Ratings'
    )

    fig.update_traces(textposition='outside')
    fig.show()

    

In [30]:
plot_bechdel_predictions(df_bechdel_predictions)