In [8]:
import pandas as pd
from pyswip import Prolog
import numpy as np

In [9]:
# Helper that appends Prolog facts to a file.
def save_to_file(strings, filename):
    """Append ``strings`` to ``filename``, one item per line.

    The written text always ends with a newline (unless there is nothing to
    write). Without it, the last fact of one call and the first fact of the
    next call end up on the same line with no whitespace after the
    terminating ``.``, which is not valid Prolog.

    Args:
        strings: iterable of text lines (e.g. Prolog clauses).
        filename: path of the file to append to; created if missing.
    """
    text = '\n'.join(strings)
    # Terminate the batch so that consecutive appends never get glued together.
    if text and not text.endswith('\n'):
        text += '\n'

    with open(filename, 'a') as f:
        f.write(text)

### Creazione della knowledge base

In [10]:
# Build the Prolog fact base (facts.pl) from the movies dataset.

def _prolog_string(value):
    """Return ``value`` rendered as a double-quoted Prolog string literal.

    Backslashes and double quotes are escaped so that values containing
    them do not produce a syntax error when the file is consulted.
    """
    escaped = str(value).replace('\\', '\\\\').replace('"', '\\"')
    return f'"{escaped}"'


# save_to_file appends, so start from an empty file: otherwise every re-run
# of this cell would duplicate all the facts in facts.pl.
with open('facts.pl', 'w'):
    pass

# All clauses are collected here and written with a single call, so they are
# newline-separated and the file is opened only once.
facts = [":-style_check(-discontiguous).\n"]

# facts about movies
df = pd.read_csv('../dataset/movies_v2.csv')

for row in df.itertuples():
    facts.append(f'movie({row.id}).')
    facts.append(f'title({row.id}, {_prolog_string(row.title)}).')
    facts.append(f'rating({row.id}, {_prolog_string(row.rating)}).')
    facts.append(f'genre({row.id}, {_prolog_string(row.genre)}).')
    facts.append(f'year({row.id}, {row.year}).')
    facts.append(f'country({row.id}, {_prolog_string(row.country)}).')
    facts.append(f'company({row.id}, {_prolog_string(row.company)}).')
    facts.append(f'runtime({row.id}, {row.runtime}).')
    facts.append(f'budget({row.id}, {row.budget}).')
    facts.append(f'gross({row.id}, {row.gross}).')
    facts.append(f'score({row.id}, {row.score}).')
    facts.append(f'votes({row.id}, {row.votes}).')

    facts.append(f'directed_by({row.id}, {_prolog_string(row.director)}).')
    facts.append(f'star({row.id}, {_prolog_string(row.star)}).')

# unique values of the 'director' column
df_directors = df['director'].unique()
# unique values of the 'star' column
df_actors = df['star'].unique()

# facts about directors
for director in df_directors:
    facts.append(f'director({_prolog_string(director)}).')

# facts about actors
for actor in df_actors:
    facts.append(f'actor({_prolog_string(actor)}).')

save_to_file(facts, 'facts.pl')

### Feature engineering tramite KB

In [11]:
# Helper that safely runs a query against a knowledge base.
def query_kb(prolog_kb, query):
    """Return the binding of ``X`` in the first solution of ``query``.

    Args:
        prolog_kb: object exposing ``query(str)`` that returns an iterable of
            solution dicts (here, a pyswip ``Prolog`` instance).
        query: Prolog goal, as text, that binds the variable ``X``.

    Returns:
        The value bound to ``X`` in the first solution, or ``None`` when the
        query has no solutions.
    """
    result = prolog_kb.query(query)
    if result is None:
        return None

    # The result may be a generator, which is always truthy: it has to be
    # materialized before it can be checked for emptiness. Consuming it fully
    # (as the previous implementation did) also lets the query finish.
    solutions = list(result)
    if not solutions:
        return None
    return solutions[0]['X']

# Helper that derives the new movies dataframe from the knowledge base.
def derive_movies_data(df, prolog_kb):
    """Build a dataframe of per-movie features by querying the knowledge base.

    For every value in ``df['id']`` one ``predicate(<id>, X).`` query is run
    per output column through ``query_kb``.

    Args:
        df: dataframe with an ``id`` column.
        prolog_kb: knowledge base handle forwarded to ``query_kb``.

    Returns:
        ``pd.DataFrame`` with one row per movie id; columns are ``id``
        followed by the columns listed in ``feature_predicates``, in order.
    """
    # (output column, predicate queried as ``predicate(<id>, X).``), in output order.
    # NOTE(review): 'director' is queried as director/2, while the facts
    # generated in this notebook store that relation as directed_by/2 -
    # confirm that clauses.pl defines director/2.
    feature_predicates = (
        ('title', 'title'),
        ('country', 'country'),
        ('company', 'company'),
        ('rating', 'rating'),
        ('votes', 'votes'),
        ('age', 'movie_age'),
        ('genre', 'genre_regrouped'),
        ('runtime', 'runtime'),
        ('score', 'score'),
        ('profit_index', 'movie_profit_index'),
        ('success_index', 'movie_success_index'),
        ('cult_index', 'movie_cult_index'),
        ('director', 'director'),
        ('star', 'star'),
    )

    records = []
    for movie_id in df['id']:
        record = {'id': movie_id}
        for column, predicate in feature_predicates:
            record[column] = query_kb(prolog_kb, f'{predicate}({movie_id}, X).')
        records.append(record)

    return pd.DataFrame(records)

# Helper that derives the new directors dataframe from the knowledge base.
def derive_directors_data(df, prolog_kb):
    """Build a dataframe of per-director features by querying the knowledge base.

    Args:
        df: iterable of director names.
        prolog_kb: knowledge base handle forwarded to ``query_kb``.

    Returns:
        ``pd.DataFrame`` with columns ``director``, ``director_num_movies``,
        ``director_profit_index`` and ``director_score_index``; one row per
        name in ``df``.
    """
    new_data = []

    for director in df:
        # Escape backslashes and double quotes so that a name containing them
        # still forms a valid double-quoted Prolog string inside the query.
        quoted = str(director).replace('\\', '\\\\').replace('"', '\\"')

        new_data.append({
            'director': director,
            'director_num_movies': query_kb(prolog_kb, f'director_num_movies("{quoted}", X).'),
            'director_profit_index': query_kb(prolog_kb, f'director_profit_index("{quoted}", X).'),
            'director_score_index': query_kb(prolog_kb, f'director_score_index("{quoted}", X).'),
        })

    return pd.DataFrame(new_data)

# Helper that derives the new actors dataframe from the knowledge base.
def derive_actors_data(df, prolog_kb):
    """Build a dataframe of per-actor features by querying the knowledge base.

    Args:
        df: iterable of actor names.
        prolog_kb: knowledge base handle forwarded to ``query_kb``.

    Returns:
        ``pd.DataFrame`` with columns ``actor``, ``actor_num_movies``,
        ``actor_profit_index`` and ``actor_score_index``; one row per name
        in ``df``.
    """
    new_data = []

    for actor in df:
        # Escape backslashes and double quotes so that a name containing them
        # still forms a valid double-quoted Prolog string inside the query.
        quoted = str(actor).replace('\\', '\\\\').replace('"', '\\"')

        new_data.append({
            'actor': actor,
            'actor_num_movies': query_kb(prolog_kb, f'actor_num_movies("{quoted}", X).'),
            'actor_profit_index': query_kb(prolog_kb, f'actor_profit_index("{quoted}", X).'),
            'actor_score_index': query_kb(prolog_kb, f'actor_score_index("{quoted}", X).'),
        })

    # The collected rows were previously never returned, so callers got None.
    return pd.DataFrame(new_data)

In [None]:
# Derive the new dataframes from the knowledge base.
# Load the generated facts first, then the rules from clauses.pl.
prolog = Prolog()
for kb_file in ('facts.pl', 'clauses.pl'):
    prolog.consult(kb_file)

movies = pd.read_csv('../dataset/movies_v2.csv')

# Movies: one row per movie id.
new_movies = derive_movies_data(movies, prolog)
new_movies.to_csv('../dataset/movies_new.csv', index=False)

# Directors: one row per distinct director name.
directors = movies['director'].unique()
new_directors = derive_directors_data(directors, prolog)
new_directors.to_csv('../dataset/directors_new.csv', index=False)

# Actors: one row per distinct star name.
actors = movies['star'].unique()
new_actors = derive_actors_data(actors, prolog)
new_actors.to_csv('../dataset/actors_new.csv', index=False)