In [2]:
import pandas as pd
from pyswip import Prolog
import numpy as np

In [2]:
# Append a batch of Prolog facts to a file.
def save_to_file(strings, filename):
    """Append ``strings`` to ``filename``, one entry per line.

    Args:
        strings: iterable of text entries; they are joined with newlines.
        filename: path of the file to append to (created if missing).

    The written batch always ends with a newline. The file is opened in
    append mode, so without that terminator the first entry of the next
    batch would be glued to the last entry of this one on the same line.
    """
    text = '\n'.join(strings)
    # Only add the terminator when it is missing, so entries that already end
    # with a newline do not gain an extra blank line.
    if text and not text.endswith('\n'):
        text += '\n'
    with open(filename, 'a') as f:
        f.write(text)

### Creazione della knowledge base

In [3]:
def _prolog_string(value):
    """Return ``value`` rendered as a double-quoted Prolog string literal.

    Backslashes and double quotes are escaped so that text containing them
    (for example a title with a quotation mark) cannot end the literal early
    and corrupt the generated facts file.
    """
    escaped = str(value).replace('\\', '\\\\').replace('"', '\\"')
    return f'"{escaped}"'


# save_to_file appends, so begin from an empty file; otherwise every re-run of
# this cell would add a second copy of each fact to the knowledge base.
with open('facts.pl', 'w'):
    pass

save_to_file([":-style_check(-discontiguous).\n"], 'facts.pl')

# Facts for the movies
df = pd.read_csv('../dataset/movies_v2.csv')

facts = []
for row in df.itertuples():
    # Each record ends with a newline so that whatever is appended to the file
    # after this batch starts on its own line.
    facts.append(
        f'movie({row.id}).\n'
        f'title({row.id}, {_prolog_string(row.title)}).\n'
        f'rating({row.id}, {_prolog_string(row.rating)}).\n'
        f'genre({row.id}, {_prolog_string(row.genre)}).\n'
        f'year({row.id}, {row.year}).\n'
        f'country({row.id}, {_prolog_string(row.country)}).\n'
        f'company({row.id}, {_prolog_string(row.company)}).\n'
        f'runtime({row.id}, {row.runtime}).\n'
        f'budget({row.id}, {row.budget}).\n'
        f'gross({row.id}, {row.gross}).\n'
        f'score({row.id}, {row.score}).\n'
        f'votes({row.id}, {row.votes}).\n'
        f'directed_by({row.id}, {_prolog_string(row.director)}).\n'
        f'star({row.id}, {_prolog_string(row.star)}).\n'
    )

save_to_file(facts, 'facts.pl')

# Unique values of the 'director' column
df_directors = df['director'].unique()
# Unique values of the 'star' column
df_actors = df['star'].unique()

facts = []
# Facts for the directors
for director in df_directors:
    facts.append(f'director({_prolog_string(director)}).')

# Facts for the actors
for actor in df_actors:
    facts.append(f'actor({_prolog_string(actor)}).')

save_to_file(facts, 'facts.pl')

### Feature engineering tramite KB

In [4]:
# Run a query against a knowledge base without failing on an empty result.
def query_kb(prolog_kb, query):
    """Return the value bound to ``X`` in the first solution of ``query``.

    Args:
        prolog_kb: object exposing ``query(text)`` that returns an iterable of
            solution dicts (the notebook passes a pyswip ``Prolog`` instance).
        query: query text; it must bind the variable ``X``.

    Returns:
        The ``'X'`` entry of the first solution, or ``None`` when the query
        has no solutions.

    Raises:
        KeyError: if the first solution does not contain ``'X'``.
    """
    result = prolog_kb.query(query)
    # A lazy iterator is truthy even when it yields nothing, so emptiness has
    # to be checked on the materialised list rather than on the iterator.
    solutions = list(result) if result else []
    if not solutions:
        return None
    return solutions[0]['X']

# Build the movies feature table by querying the knowledge base.
def derive_movies_data(df, prolog_kb):
    """Return one row of knowledge-base features per movie id.

    Args:
        df: frame whose ``'id'`` column lists the movie ids to look up.
        prolog_kb: knowledge base handed to ``query_kb`` for every lookup.

    Returns:
        A DataFrame with the columns id, title, country, company, rating,
        votes, age, genre, runtime, score, profit_index, success_index,
        cult_index, director and star, in that order. Each value other than
        ``id`` is whatever ``query_kb`` returns for the matching predicate.
    """
    def ask(goal):
        return query_kb(prolog_kb, goal)

    # Dict literals evaluate their values left to right, so the queries run in
    # the same order as the columns are listed.
    rows = [
        {
            'id': movie_id,
            # plain facts
            'title': ask(f'title({movie_id}, X).'),
            'country': ask(f'country({movie_id}, X).'),
            'company': ask(f'company({movie_id}, X).'),
            'rating': ask(f'rating({movie_id}, X).'),
            'votes': ask(f'votes({movie_id}, X).'),
            'age': ask(f'movie_age({movie_id}, X).'),
            'genre': ask(f'genre_regrouped({movie_id}, X).'),
            'runtime': ask(f'runtime({movie_id}, X).'),
            'score': ask(f'score({movie_id}, X).'),
            # derived indexes
            'profit_index': ask(f'movie_profit_index({movie_id}, X).'),
            'success_index': ask(f'movie_success_index({movie_id}, X).'),
            'cult_index': ask(f'movie_cult_index({movie_id}, X).'),
            # people
            'director': ask(f'directed_by({movie_id}, X).'),
            'star': ask(f'star({movie_id}, X).'),
        }
        for movie_id in df['id']
    ]
    return pd.DataFrame(rows)

# Build the directors feature table by querying the knowledge base.
def derive_directors_data(df, prolog_kb):
    """Return one row of knowledge-base statistics per director.

    Args:
        df: iterable of director names. Despite the parameter name it is
            iterated directly; the notebook passes the array of unique names.
        prolog_kb: knowledge base handed to ``query_kb`` for every lookup.

    Returns:
        A DataFrame with the columns director, director_num_movies,
        director_profit_mean, director_profit_std, director_score_mean and
        director_score_std, in that order.
    """
    new_data = []

    for director in df:
        # Escape backslashes and double quotes so the name stays one
        # well-formed Prolog string literal inside the query text; an
        # unescaped quote would end the literal early and break the query.
        name = '"' + str(director).replace('\\', '\\\\').replace('"', '\\"') + '"'

        new_data.append({
            'director': director,
            'director_num_movies': query_kb(prolog_kb, f'director_num_movies({name}, X).'),
            'director_profit_mean': query_kb(prolog_kb, f'avg_profit_by_director({name}, X).'),
            'director_profit_std': query_kb(prolog_kb, f'std_dev_profit_by_director({name}, X).'),
            'director_score_mean': query_kb(prolog_kb, f'avg_score_by_director({name}, X).'),
            'director_score_std': query_kb(prolog_kb, f'std_dev_score_by_director({name}, X).'),
        })

    return pd.DataFrame(new_data)

# Build the actors feature table by querying the knowledge base.
def derive_actors_data(df, prolog_kb):
    """Return one row of knowledge-base statistics per actor.

    Args:
        df: iterable of actor names. Despite the parameter name it is
            iterated directly; the notebook passes the array of unique names.
        prolog_kb: knowledge base handed to ``query_kb`` for every lookup.

    Returns:
        A DataFrame with the columns actor, actor_num_movies,
        actor_profit_mean, actor_profit_std, actor_score_mean and
        actor_score_std, in that order.
    """
    new_data = []

    for actor in df:
        # Escape backslashes and double quotes so the name stays one
        # well-formed Prolog string literal inside the query text; an
        # unescaped quote would end the literal early and break the query.
        name = '"' + str(actor).replace('\\', '\\\\').replace('"', '\\"') + '"'

        new_data.append({
            'actor': actor,
            'actor_num_movies': query_kb(prolog_kb, f'star_num_movies({name}, X).'),
            'actor_profit_mean': query_kb(prolog_kb, f'avg_profit_by_star({name}, X).'),
            'actor_profit_std': query_kb(prolog_kb, f'std_dev_profit_by_star({name}, X).'),
            'actor_score_mean': query_kb(prolog_kb, f'avg_score_by_star({name}, X).'),
            'actor_score_std': query_kb(prolog_kb, f'std_dev_score_by_star({name}, X).'),
        })

    return pd.DataFrame(new_data)

In [5]:
# Derive the new dataframes from the knowledge base.
# NOTE(review): the pandas-based cell further down writes the same three CSV
# paths, so its output replaces these files when the notebook runs top to bottom.
prolog = Prolog()
for kb_source in ('facts.pl', 'clauses.pl'):
    # Same load order as before: the generated facts, then clauses.pl.
    prolog.consult(kb_source)

movies = pd.read_csv('../dataset/movies_v2.csv')

# Movies: one row per id.
new_movies = derive_movies_data(df=movies, prolog_kb=prolog)
new_movies.to_csv('../dataset/movies_new.csv', index=False)

# Directors: one row per distinct name in the 'director' column.
directors = pd.unique(movies['director'])
new_directors = derive_directors_data(df=directors, prolog_kb=prolog)
new_directors.to_csv('../dataset/directors_new.csv', index=False)

# Actors: one row per distinct name in the 'star' column.
actors = pd.unique(movies['star'])
new_actors = derive_actors_data(df=actors, prolog_kb=prolog)
new_actors.to_csv('../dataset/actors_new.csv', index=False)

In [3]:
# Load the movies and add the derived per-movie features in one chain.
# Keyword arguments of assign() are evaluated in order, so success_index can
# read the profit_index column created just before it.
df = pd.read_csv('../dataset/movies_v2.csv').assign(
    # age relative to the fixed reference year 2024
    age=lambda m: 2024 - m['year'],
    # profit relative to the budget
    profit_index=lambda m: (m['gross'] - m['budget']) / m['budget'],
    # relative profit weighted by the natural log of the budget
    success_index=lambda m: m['profit_index'] * np.log(m['budget']),
    # score weighted by the natural log of the number of votes
    cult_index=lambda m: m['score'] * np.log(m['votes']),
)

def genre_regroup(genre):
    """Collapse the listed minor genres into the single label ``'Others'``.

    Args:
        genre: genre label of one movie.

    Returns:
        ``'Others'`` when ``genre`` is one of the listed minor genres,
        otherwise ``genre`` unchanged.
    """
    # 'Sci-Fy' is the spelling the original list used and is kept for backward
    # compatibility; 'Sci-Fi' is the conventional spelling and was not matched
    # before. NOTE(review): confirm which spelling the dataset actually uses.
    minor_genres = ('Mystery', 'Thriller', 'Sci-Fi', 'Sci-Fy', 'Family', 'Romance', 'Western')
    return 'Others' if genre in minor_genres else genre

# Replace every genre label with its regrouped value, element by element.
df['genre'] = df['genre'].map(genre_regroup)

def rating_regroup(rating):
    """Collapse the listed ratings into the single label ``'Others'``.

    Args:
        rating: rating label of one movie.

    Returns:
        ``'Others'`` for 'NC-17', 'TV-MA', 'Approved' and 'X'; any other
        value is returned unchanged.
    """
    regrouped = ('NC-17', 'TV-MA', 'Approved', 'X')
    return 'Others' if rating in regrouped else rating

# Replace every rating label with its regrouped value, element by element.
df['rating'] = df['rating'].map(rating_regroup)

# New features for directors and actors
directors = df['director'].unique()
actors = df['star'].unique()


def _person_stats(movies, person_col, names, prefix):
    """Aggregate per-person movie statistics in a single groupby pass.

    Args:
        movies: movie frame holding ``person_col``, 'profit_index' and 'score'.
        person_col: column identifying the person ('director' or 'star').
        names: names to report, one output row each, in this order.
        prefix: prefix of the output columns ('director' or 'actor').

    Returns:
        A DataFrame with the columns ``<prefix>_name``, ``<prefix>_num_movies``,
        ``<prefix>_profit_mean``, ``<prefix>_profit_std``,
        ``<prefix>_score_mean`` and ``<prefix>_score_std``.

    This replaces a full scan of ``movies`` per name and per statistic. The
    values match that approach up to floating-point rounding. The former
    "fill null means" loops are gone on purpose: a mean is null only when
    every underlying value is null, so they never changed a value.
    """
    grouped = movies.groupby(person_col, sort=False)
    stats = pd.DataFrame({
        f'{prefix}_num_movies': grouped.size(),
        f'{prefix}_profit_mean': grouped['profit_index'].mean(),
        f'{prefix}_profit_std': grouped['profit_index'].std(),
        f'{prefix}_score_mean': grouped['score'].mean(),
        f'{prefix}_score_std': grouped['score'].std(),
    }).reindex(names)

    # A name with no group (e.g. a missing value, which groupby drops and an
    # equality filter never matches) counts zero movies.
    count_col = f'{prefix}_num_movies'
    stats[count_col] = stats[count_col].fillna(0).astype('int64')

    # The sample standard deviation is undefined for fewer than two values;
    # report 0 in that case.
    std_cols = [f'{prefix}_profit_std', f'{prefix}_score_std']
    stats[std_cols] = stats[std_cols].fillna(0)

    stats.insert(0, f'{prefix}_name', names)
    return stats.reset_index(drop=True)


# Build the new dataframes with their derived features
df_directors = _person_stats(df, 'director', directors, 'director')
df_actors = _person_stats(df, 'star', actors, 'actor')

df_directors.to_csv('../dataset/directors_new.csv', index=False)
df_actors.to_csv('../dataset/actors_new.csv', index=False)
df.to_csv('../dataset/movies_new.csv', index=False)

In [4]:
df_directors = pd.read_csv('../dataset/directors_new.csv')
df_actors = pd.read_csv('../dataset/actors_new.csv')

# Final join: attach the director and actor statistics to every movie, then
# drop the two name columns that only served as join keys.
df = (
    pd.read_csv('../dataset/movies_new.csv')
    .merge(df_directors, how='left', left_on='director', right_on='director_name')
    .merge(df_actors, how='left', left_on='star', right_on='actor_name')
    .drop(columns=['director_name', 'actor_name'])
)

df.to_csv('../dataset/movies_final_v1.csv', index=False)