In [50]:
import pandas as pd
from pyswip import Prolog
import numpy as np

In [51]:
# funzione che salva su un file i fatti
def save_to_file(strings, filename):
    with open(filename, 'a') as f:
        f.write('\n'.join(strings))

### Creazione della knowledge base

In [52]:
save_to_file([":-style_check(-discontiguous).\n"], 'facts.pl')

# fatti per i film
df = pd.read_csv('../dataset/movies_v2.csv')

for row in df.itertuples():
    facts = []
    facts.append(f"movie({row.id}).")
    facts.append(f"title({row.id}, '{row.title}').")
    facts.append(f"rating({row.id}, '{row.rating}').")
    facts.append(f"genre({row.id}, '{row.genre}').")
    facts.append(f"year({row.id}, {row.year}).")
    facts.append(f"score({row.id}, {row.score}).")
    facts.append(f"votes({row.id}, {row.votes}).")
    facts.append(f"country({row.id}, '{row.country}').")
    facts.append(f"budget({row.id}, {row.budget}).")
    facts.append(f"gross({row.id}, {row.gross}).")
    facts.append(f"company({row.id}, '{row.company}').")
    facts.append(f"runtime({row.id}, {row.runtime}).")
    # relazioni tra film e artisti
    facts.append(f"director({row.id}, '{row.director}').")
    facts.append(f"star({row.id}, '{row.star}').\n")

    save_to_file(facts, 'facts.pl')

# seleziono tutti i valori unici per la colonna 'director'
df_directors = df['director'].unique()
# seleziono tutti i valori unici per la colonna 'star'
df_stars = df['star'].unique()

# ottengo l'intersezione tra i valori unici per la colonna 'director' e 'star'
df_common = pd.Series(np.intersect1d(df_directors, df_stars))

# fatti per gli artisti
for name in df_common:
    facts = []
    facts.append(f"artist('{name}').")
    save_to_file(facts, 'facts.pl')

### Feature engineering con Prolog

In [53]:
clauses = []

# clausola per il conteggio dei film di un attore
clauses.append(("num_films_by_star(Star, Count) :-\n"
                "\tfindall(Movie, (star(Movie, Star), Movies),\n"
                "\tlength(Movies, Count)).\n\n"))

# clausola per il conteggio dei film di un regista
clauses.append(("num_films_by_director(Director, Count) :-\n"
                "\tfindall(Movie, (director(Movie, Director), Movies),\n"
                "\tlength(Movies, Count)).\n\n"))

# clausola per la media di profit di un attore
clauses.append(("average_profit_by_star(Star, AvgProfit) :-\n"
                "\tfindall(Profit, (star(Movie, Star), film_profit_index(Movie, Profit)), Profits),\n"
                "\tlength(Profits, N),\n"
                "\tsum_list(Profits, Total),\n"
                "\tAvgProfit is Total / N.\n\n"))

# clausola per la media di score di un attore
clauses.append(("average_score_by_star(Star, AvgScore) :-\n"
                "\tfindall(Score, (star(Movie, Star), score(Movie, Score)), Scores),\n"
                "\tlength(Scores, N),\n"
                "\tsum_list(Scores, Total),\n"
                "\tAvgScore is Total / N.\n\n"))

# clausola per la deviazione standard di profit di un attore
clauses.append(("std_dev_profit_by_star(Star, StdDev) :-\n"
                "\taverage_profit_by_star(Star, AvgProfit),\n"
                "\tfindall(Profit, (star(Movie, Star), film_profit_index(Movie, Profit)), Profits),\n"
                "\tlength(Profits, N),\n"
                "\tsum_list(Profits, Total),\n"
                "\tsum_of_squares(Profits, Total, AvgProfit, SumSq),\n"
                "\tVariance is SumSq / (N - 1),\n"
                "\tStdDev is sqrt(Variance).\n\n"))

# clausola per la deviazione standard di score di un attore
clauses.append(("std_dev_score_by_star(Star, StdDev) :-\n"
                "\taverage_score_by_star(Star, AvgScore),\n"
                "\tfindall(Score, (star(Movie, Star), score(Movie, Score)), Scores),\n"
                "\tlength(Scores, N),\n"
                "\tsum_list(Scores, Total),\n"
                "\tsum_of_squares(Scores, Total, AvgScore, SumSq),\n"
                "\tVariance is SumSq / (N - 1),\n"
                "\tStdDev is sqrt(Variance).\n\n"))

# definisci la clausola sum_of_squares/4 per calcolare la somma dei quadrati di una lista di valori
clauses.append(("sum_of_squares([], _, _, 0).\n"
                "\tsum_of_squares([Value|Rest], Total, Avg, SumSq) :-\n"
                "\tSumSqRest is SumSq + (Value - Avg)^2,\n"
                "\tsum_of_squares(Rest, Total, Avg, SumSqRest).\n\n"))

# clausola per la media di profit di un regista
clauses.append(("average_profit_by_director(Director, AvgProfit) :-\n"
                "\tfindall(Profit, (director(Movie, Director), film_profit_index(Movie, Profit)), Profits),\n"
                "\tlength(Profits, N),\n"
                "\tsum_list(Profits, Total),\n"
                "\tAvgProfit is Total / N.\n\n"))

# clausola per la media di score di un regista
clauses.append(("average_score_by_director(Director, AvgScore) :-\n"
                "\tfindall(Score, (director(Movie, Director), score(Movie, Score)), Scores),\n"
                "\tlength(Scores, N),\n"
                "\tsum_list(Scores, Total),\n"
                "\tAvgScore is Total / N.\n\n"))

# clausola per la deviazione standard di profit di un regista
clauses.append(("std_dev_profit_by_director(Director, StdDev) :-\n"
                "\taverage_profit_by_director(Director, AvgProfit),\n"
                "\tfindall(Profit, (director(Movie, Director), film_profit_index(Movie, Profit)), Profits),\n"
                "\tlength(Profits, N),\n"
                "\tsum_list(Profits, Total),\n"
                "\tsum_of_squares(Profits, Total, AvgProfit, SumSq),\n"
                "\tVariance is SumSq / (N - 1),\n"
                "\tStdDev is sqrt(Variance).\n\n"))

# clausola per la deviazione standard di score di un regista
clauses.append(("std_dev_score_by_director(Director, StdDev) :-\n"
                "\taverage_score_by_director(Director, AvgScore),\n"
                "\tfindall(Score, (director(Movie, Director), score(Movie, Score)), Scores),\n"
                "\tlength(Scores, N),\n"
                "\tsum_list(Scores, Total),\n"
                "\tsum_of_squares(Scores, Total, AvgScore, SumSq),\n"
                "\tVariance is SumSq / (N - 1),\n"
                "\tStdDev is sqrt(Variance).\n\n"))

# clausola per l'indice media/deviazione standard per profit di un regista
clauses.append(("director_profit_index(Director, Ratio) :-\n"
                "\tstd_dev_profit_by_director(Director, StdDev),\n"
                "\taverage_profit_by_director(Director, AvgProfit),\n"
                "\tRatio is AvgProfit / StdDev.\n"))

# clausola per l'indice media/deviazione standard per score di un regista
clauses.append(("director_score_index(Director, Ratio) :-\n"
                "\tstd_dev_score_by_director(Director, StdDev),\n"
                "\taverage_score_by_director(Director, AvgScore),\n"
                "\tRatio is AvgScore / StdDev.\n"))

# clausola per l'indice media/deviazione standard per profit di un attore
clauses.append(("star_profit_index(Star, Ratio) :-\n"
                "\tstd_dev_profit_by_star(Star, StdDev),\n"
                "\taverage_profit_by_star(Star, AvgProfit),\n"
                "\tRatio is AvgProfit / StdDev.\n"))

# clausola per l'indice media/deviazione standard per score di un attore
clauses.append(("star_score_index(Star, Ratio) :-\n"
                "\tstd_dev_score_by_star(Star, StdDev),\n"
                "\taverage_score_by_star(Star, AvgScore),\n"
                "\tRatio is AvgScore / StdDev.\n"))

# clausola per l'indice di utile di un film
clauses.append(("film_profit_index(Movie, Ratio) :-\n"
                "\tgross(Movie, Gross),\n"
                "\tbudget(Movie, Budget),\n"
                "\tBudget > 0,\n"
                "\tRatio is (Gross / Budget).\n"))

# clausola per l'indice di successo commerciale di un film
clauses.append(("film_success_index(Movie, Ratio) :-\n"
                "\tfilm_profit_index(Movie, ProfitIndex),\n"
                "\tgross(Movie, Gross),\n"
                "\tRatio is ProfitIndex * log(Gross).\n"))

# clausola per l'indice di culto di un film
clauses.append(("film_cult_index(Movie, Ratio) :-\n"
                "\tvotes(Movie, Votes),\n"
                "\tscore(Movie, Score),\n"
                "\tRatio is Score * log(Votes).\n"))

# clausola per la media di score dei film
clauses.append(("avg_score(AvgScore) :-\n"
                "\tfindall(Score, score(movie(_), Score), Scores),\n"
                "\tlength(Scores, N),\n"
                "\tN > 0,\n"
                "\tsum_list(Scores, TotalScore),\n"
                "\tAvgScore is TotalScore / N.\n"))

# clausola per la deviazione standard di score dei film
clauses.append(("std_dev_score(StdDev) :-\n"
                "\tavg_score(AvgScore),\n"
                "\tfindall(Score, score(movie(_), Score), Scores),\n"
                "\tlength(Scores, N),\n"
                "\tN > 1,\n"
                "\tsum_list(Scores, TotalScore),\n"
                "\tsum_of_squares(Scores, TotalScore, AvgScore, SumSq),\n"
                "\tVariance is SumSq / (N - 1),\n"
                "\tStdDev is sqrt(Variance).\n"))

# clausola per la qualit√† dei film
clauses.append(("film_quality(Movie, 'bassa') :-\n"
                "\tavg_score(AvgScore),\n"
                "\tstd_dev_score(StdDev),\n"
                "\tscore(Movie, Score),\n"
                "\tScore =< AvgScore - StdDev.\n"))

clauses.append(("film_quality(Movie, 'media') :-\n"
                "\tavg_score(AvgScore),\n"
                "\tstd_dev_score(StdDev),\n"
                "\tscore(Movie, Score),\n"
                "\tScore > AvgScore - StdDev,\n"
                "\tScore =< AvgScore + StdDev.\n"))

clauses.append(("film_quality(Movie, 'alta') :-\n"
                "\tavg_score(AvgScore),\n"
                "\tstd_dev_score(StdDev),\n"
                "\tscore(Movie, Score),\n"
                "\tScore > AvgScore + StdDev.\n"))

clauses.append(("remap_genre(Movie, 'Others') :-\n"
                "\tgenre(Movie, Genre),\n"
                "\tmember(Genre, ['Mystery', 'Thriller', 'Sci-Fy', 'Family', 'Romance', 'Western']).\n"))

clauses.append(("remap_genre(Movie, Genre) :-\n"
                "\tgenre(Movie, Genre),\n"
                "\t\\+ member(Genre, ['Mystery', 'Thriller', 'Sci-Fy', 'Family', 'Romance', 'Western']).\n"))

save_to_file(clauses, 'clauses.pl')

In [54]:
# definisci una funzione che derivi il nuovo dataset dalla base di conoscenza Prolog
def derive_new_dataset(prolog_kb):
    features = {}

    movies = pd.read_csv('../dataset/movies_v2.csv')

    for id in movies['id']:
        features['rating'] = list(prolog_kb.query(f"rating({id}, Rating)"))[0]['Rating']
        features['genre'] = list(prolog_kb.query(f"genre({id}, Genre)"))[0]['Genre']
        features['year'] = list(prolog_kb.query(f"year({id}, Year)"))[0]['Year']
        features['score'] = list(prolog_kb.query(f"score({id}, Score)"))[0]['Score']
        features['country'] = list(prolog_kb.query(f"country({id}, Country)"))[0]['Country']
        features['profit_index'] = list(prolog_kb.query(f"film_profit_index({id}, ProfitIndex)"))[0]['ProfitIndex']
        features['success_index'] = list(prolog_kb.query(f"film_success_index({id}, SuccessIndex)"))[0]['SuccessIndex']
        features['cult_index'] = list(prolog_kb.query(f"film_cult_index({id}, CultIndex)"))[0]['CultIndex']

        director = list(prolog_kb.query(f"director({id}, Director)"))[0]['Director']
        star = list(prolog_kb.query(f"star({id}, Star)"))[0]['Star']

        features['dictor_num_films'] = list(prolog_kb.query(f"num_films_by_director('{director}', NumFilms)"))[0]['NumFilms']
        features['star_num_films'] = list(prolog_kb.query(f"num_films_by_star('{star}', NumFilms)"))[0]['NumFilms']
        features['director_score_index'] = list(prolog_kb.query(f"director_score_index('{director}', ScoreIndex)"))[0]['ScoreIndex']
        features['director_profit_index'] = list(prolog_kb.query(f"director_profit_index('{director}', ProfitIndex)"))[0]['ProfitIndex']
        features['star_score_index'] = list(prolog_kb.query(f"star_score_index('{star}', ScoreIndex)"))[0]['ScoreIndex']
        features['star_profit_index'] = list(prolog_kb.query(f"star_profit_index('{star}', ProfitIndex)"))[0]['ProfitIndex']

    new_dataset = pd.DataFrame(features)
    new_dataset.to_csv('new_dataset.csv', index=False)

In [55]:
prolog = Prolog()
prolog.consult('facts.pl')
prolog.consult('clauses.pl')

derive_new_dataset(prolog)

PrologError: Caused by: 'num_films_by_director('Stanley Kubrick', NumFilms)'. Returned: 'error(instantiation_error, context(:(system, /(<meta-call>, 1)), _3428))'.