In [1]:
from kb_utils import *

### Creazione della KB

In [2]:
# creazione della kb
create_kb()

### Feature engineering tramite KB `facts.pl` e `clauses.pl`

In [3]:
# consulto della kb
prolog = Prolog()
prolog.consult('facts.pl')
prolog.consult('clauses.pl')


df_movies = pd.read_csv('../dataset/movies_v2.csv')

# derivo il dataframe dei film con features non binnate
df_movies_not_binned = derive_movies_data(df_movies, prolog, binning=False)
df_movies_not_binned.to_csv('../dataset/movies_not_binned.csv', index=False)

# derivo il dataframe dei film con features binnate
df_movies_binned = derive_movies_data(df_movies, prolog, binning=True)
df_movies_binned.to_csv('../dataset/movies_binned.csv', index=False)

In [3]:
import pandas as pd
import numpy as np

df = pd.read_csv('../dataset/movies_v2.csv')

# nuove features per movies
df['age'] = 2024 - df['year']
df['profit_index'] = (df['gross'] - df['budget']) / df['budget']
df['cult_index'] = df['score'] * np.log(df['votes'])

def genre_regroup(genre):
    if genre in ['Fantasy', 'Mystery', 'Thriller', 'Sci-Fi', 'Family', 'Romance', 'Western']:
        return 'Other'
    return genre

df['genre'] = df['genre'].apply(genre_regroup)

def rating_regroup(rating):
    if rating in ['NC-17', 'TV-MA', 'Approved', 'X']:
        return 'Other'
    return rating

df['rating'] = df['rating'].apply(rating_regroup)


df['director_num_movies'] = None
df['director_age'] = None
df['director_profit_mean'] = None
df['director_profit_std'] = None
df['director_score_mean'] = None
df['director_score_std'] = None

df['actor_num_movies'] = None
df['actor_age'] = None
df['actor_profit_mean'] = None
df['actor_profit_std'] = None
df['actor_score_mean'] = None
df['actor_score_std'] = None

df_profs = pd.read_csv('../dataset/professionals.csv')

def compute_age(df, movie, df_profs, prof):
    return df[(df['id'] == movie)]['year'] - df_profs[df_profs['primaryName'] == prof]['birthYear']

def compute_director_mean(df, exclude, director, column):
    total, n = 0, 0

    for row in df.itertuples():
        if row.director == director and row.Index != exclude:
            total += getattr(row, column)
            n += 1

    if n > 0:
        return total / n
    else:
        return 0

def compute_director_std(df, exclude, director, column):
    total, n = 0, 0
    director_mean = compute_director_mean(df, exclude, director, column)

    for row in df.itertuples():
        if row.director == director and row.Index != exclude:
            total = total + (getattr(row, column) - director_mean)**2
            n += 1

    if n > 1:
        return total / (n - 1)
    else:
        return 0

def compute_actor_mean(df, exclude, actor, column):
    total, n = 0, 0

    for row in df.itertuples():
        if row.star == actor and row.Index != exclude:
            total += getattr(row, column)
            n += 1

    if n > 0:
        return total / n
    else:
        return 0
    
def compute_actor_std(df, exclude, actor, column):
    total, n = 0, 0
    actor_mean = compute_actor_mean(df, exclude, actor, column)

    for row in df.itertuples():
        if row.star == actor and row.Index != exclude:
            total = total + (getattr(row, column) - actor_mean)**2
            n += 1

    if n > 1:
        return total / (n - 1)
    else:
        return 0


for row in df.itertuples():
    df.at[row.Index, 'director_num_movies'] = len(df[df['director'] == row.director]) - 1
    df.at[row.Index, 'director_age'] = compute_age(df, row.Index, df_profs, row.director)
    df.at[row.Index, 'director_profit_mean'] = compute_director_mean(df, row.Index, row.director, 'profit_index')
    df.at[row.Index, 'director_profit_std'] = compute_director_std(df, row.Index, row.director, 'profit_index')
    df.at[row.Index, 'director_score_mean'] = compute_director_mean(df, row.Index, row.director, 'score')
    df.at[row.Index, 'director_score_std'] = compute_director_std(df, row.Index, row.director, 'score')

    df.at[row.Index, 'actor_num_movies'] = len(df[df['star'] == row.star]) - 1
    df.at[row.Index, 'actor_age'] = compute_age(df, row.Index, df_profs, row.star)
    df.at[row.Index, 'actor_profit_mean'] = compute_actor_mean(df, row.Index, row.star, 'profit_index')
    df.at[row.Index, 'actor_profit_std'] = compute_actor_std(df, row.Index, row.star, 'profit_index')
    df.at[row.Index, 'actor_score_mean'] = compute_actor_mean(df, row.Index, row.star, 'score')
    df.at[row.Index, 'actor_score_std'] = compute_actor_std(df, row.Index, row.star, 'score')


def bin_score(score):
    if score < df['score'].mean() - df['score'].std():
        return 'low'
    elif score >= df['score'].mean() - df['score'].std() and score < df['score'].mean():
        return 'mid-low'
    elif score >= df['score'].mean() and score < df['score'].mean() + df['score'].std():
        return 'mid-high'
    elif score >= df['score'].mean() + df['score'].std():
        return 'high'

df['quality'] = df['score'].apply(bin_score)

def bin_profit_index(profit_index):
    if profit_index < 1:
        return 'not-profitable'
    elif profit_index >= 1 and profit_index <= 3:
        return 'profitable'
    elif profit_index > 3:
        return 'very-profitable'
    
df['profitability'] = df['profit_index'].apply(bin_profit_index)

df = df.reindex(columns=['id', 'title', 'country', 'company', 'rating', 'genre',
                         'year', 'director', 'star', 'score', 'quality', 'profit_index', 'profitability',
                         'cult_index', 'age', 'runtime', 'votes', 'budget', 'gross',
                         'director_num_movies', 'director_age', 'director_profit_mean', 'director_profit_std', 'director_score_mean', 'director_score_std',
                         'actor_num_movies', 'actor_age', 'actor_profit_mean', 'actor_profit_std', 'actor_score_mean', 'actor_score_std'])

df.to_csv('../dataset/movies_not_binned.csv', index=False)

In [4]:
df = pd.read_csv('../dataset/movies_not_binned.csv')

def bin_num_movies(num_movies):
    if num_movies < 3:
        return 'low'
    elif num_movies >= 3 and num_movies <= 6:
        return 'mid'
    elif num_movies > 6:
        return 'high'

df['actor_num_movies_binned'] = df['actor_num_movies'].apply(bin_num_movies)
df['director_num_movies_binned'] = df['director_num_movies'].apply(bin_num_movies)

def bin_profit_mean(profit_mean):
    if profit_mean < 1:
        return 'not-profitable'
    elif profit_mean >= 1 and profit_mean < 3:
        return 'profitable'
    elif profit_mean >= 3:
        return 'very-profitable'

df['actor_profit_mean_binned'] = df['actor_profit_mean'].apply(bin_profit_mean)
df['director_profit_mean_binned'] = df['director_profit_mean'].apply(bin_profit_mean)

def bin_profit_std(profit_std):
    if profit_std < 1:
        return 'low'
    elif profit_std >= 1 and profit_std < 3:
        return 'mid'
    elif profit_std >= 3:
        return 'high'

df['actor_profit_std_binned'] = df['actor_profit_std'].apply(bin_profit_std)
df['director_profit_std_binned'] = df['director_profit_std'].apply(bin_profit_std)

def bin_score_mean(score_mean):
    if score_mean < df['score'].mean() - df['score'].std():
        return 'low'
    elif score_mean >= df['score'].mean() - df['score'].std() and score_mean < df['score'].mean():
        return 'mid-low'
    elif score_mean >= df['score'].mean() and score_mean < df['score'].mean() + df['score'].std():
        return 'mid-high'
    elif score_mean >= df['score'].mean() + df['score'].std():
        return 'high'

df['actor_score_mean_binned'] = df['actor_score_mean'].apply(bin_score_mean)
df['director_score_mean_binned'] = df['director_score_mean'].apply(bin_score_mean)

def bin_score_std(score_std):
    if score_std < 0.35:
        return 'low'
    elif score_std >= 0.35 and score_std < 0.7:
        return 'mid'
    elif score_std >= 0.7:
        return 'high'

df['actor_score_std_binned'] = df['actor_score_std'].apply(bin_score_std)
df['director_score_std_binned'] = df['director_score_std'].apply(bin_score_std)

def bin_cult_index(cult_index):
    if cult_index <= df['cult_index'].quantile(0.25):
        return 'low'
    elif cult_index > df['cult_index'].quantile(0.25) and cult_index <= df['cult_index'].quantile(0.5):
        return 'mid-low'
    elif cult_index > df['cult_index'].quantile(0.5) and cult_index <= df['cult_index'].quantile(0.75):
        return 'mid-high'
    elif cult_index > df['cult_index'].quantile(0.75):
        return 'high'

df['cult_index_binned'] = df['cult_index'].apply(bin_cult_index)

def bin_votes(votes):
    if votes <= df['votes'].quantile(0.25):
        return 'low'
    elif votes > df['votes'].quantile(0.25) and votes <= df['votes'].quantile(0.5):
        return 'mid-low'
    elif votes > df['votes'].quantile(0.5) and votes <= df['votes'].quantile(0.75):
        return 'mid-high'
    elif votes > df['votes'].quantile(0.75):
        return 'high'

df['votes_binned'] = df['votes'].apply(bin_votes)

def bin_budget(budget):
    if budget <= df['budget'].quantile(0.25):
        return 'low'
    elif budget > df['budget'].quantile(0.25) and budget <= df['budget'].quantile(0.5):
        return 'mid-low'
    elif budget > df['budget'].quantile(0.5) and budget <= df['budget'].quantile(0.75):
        return 'mid-high'
    elif budget > df['budget'].quantile(0.75):
        return 'high'

df['budget_binned'] = df['budget'].apply(bin_budget)

def bin_gross(gross):
    if gross <= df['gross'].quantile(0.25):
        return 'low'
    elif gross > df['gross'].quantile(0.25) and gross <= df['gross'].quantile(0.5):
        return 'mid-low'
    elif gross > df['gross'].quantile(0.5) and gross <= df['gross'].quantile(0.75):
        return 'mid-high'
    elif gross > df['gross'].quantile(0.75):
        return 'high'

df['gross_binned'] = df['gross'].apply(bin_gross)

def bin_age(age):
    if age <= 15:
        return 'new'
    elif age > 15 and age <= 30:
        return 'old'
    elif age > 30:
        return 'very-old'

df['age_binned'] = df['age'].apply(bin_age)

def bin_runtime(runtime):
    if runtime < 90:
        return 'short'
    elif runtime >= 90 and runtime < 150:
        return 'mid'
    elif runtime >= 150:
        return 'long'

df['runtime_binned'] = df['runtime'].apply(bin_runtime)

def bin_prof_age(age):
    if age <= 30:
        return 'young'
    elif age > 30 and age <= 60:
        return 'adult'
    elif age > 60:
        return 'old'
    
df['director_age_binned'] = df['director_age'].apply(bin_prof_age)
df['actor_age_binned'] = df['actor_age'].apply(bin_prof_age)

df['age_binned'] = df['age'].apply(bin_age)

df = df.drop(columns=['cult_index', 'votes', 'budget', 'gross', 'age', 'runtime'], axis=1)

df = df.rename(columns={'cult_index_binned': 'cult_index', 'votes_binned': 'votes', 'budget_binned': 'budget', 'gross_binned': 'gross', 'age_binned': 'age', 'runtime_binned': 'runtime'})

df = df.drop(columns=['actor_num_movies', 'actor_profit_mean', 'actor_profit_std', 'actor_score_mean', 'actor_score_std'])
df = df.drop(columns=['director_num_movies', 'director_profit_mean', 'director_profit_std', 'director_score_mean', 'director_score_std'])

df = df.rename(columns={'actor_num_movies_binned': 'actor_num_movies',
                                      'actor_age_binned': 'actor_age',
                                      'actor_profit_mean_binned': 'actor_profit_mean',
                                      'actor_profit_std_binned': 'actor_profit_std',
                                      'actor_score_mean_binned': 'actor_score_mean',
                                      'actor_score_std_binned': 'actor_score_std'})

df = df.rename(columns={'director_num_movies_binned': 'director_num_movies',
                                          'director_age_binned': 'director_age',
                                          'director_profit_mean_binned': 'director_profit_mean',
                                          'director_profit_std_binned': 'director_profit_std',
                                          'director_score_mean_binned': 'director_score_mean',
                                          'director_score_std_binned': 'director_score_std'})

df = df.reindex(columns=['id', 'title', 'country', 'company', 'rating', 'genre',
                         'year', 'director', 'star', 'score', 'quality', 'profit_index', 'profitability',
                         'cult_index', 'age', 'runtime', 'votes', 'budget', 'gross',
                         'director_num_movies', 'director_age', 'director_profit_mean', 'director_profit_std', 'director_score_mean', 'director_score_std',
                         'actor_num_movies', 'actor_age', 'actor_profit_mean', 'actor_profit_std', 'actor_score_mean', 'actor_score_std',])

df.to_csv('../dataset/movies_binned.csv', index=False)

TypeError: '<=' not supported between instances of 'str' and 'int'