In [1]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
from scipy import sparse
from project_utils import *
from pygsp import graphs, filters
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

%load_ext autoreload
%autoreload 2

In [2]:
# Load data and graph
actors_agg_adj = np.load("sparse_agg_actor_adj.npy")
actors_agg_df = pd.read_pickle("actors_agg_df.pkl")
actors_graph = nx.from_numpy_matrix(actors_agg_adj)
G = graphs.Graph(actors_agg_adj)

In [3]:
# Laplacian and spectral decomposition
G.compute_laplacian('combinatorial')
laplacian = G.L.toarray()
lam, U = np.linalg.eig(laplacian)

In [4]:
# Create ideal high-pass filter
ideal_hp = np.ones((actors_agg_adj.shape[0],))
ideal_hp[lam <= 1] = 0  # High-pass filter with cut-off at lambda=0.1

# Create ideal low-pass filter
ideal_lp = np.ones((actors_agg_adj.shape[0],))
ideal_lp[lam >= 0.1] = 0  # Low-pass filter with cut-off at lambda=0.1

# Create ideal band-pass filter
ideal_bp = np.ones((actors_agg_adj.shape[0],))
ideal_bp[lam < 0.1] = 0  # Band-pass filter with cut-offs at lambda=0.1 and lambda=0.5
ideal_bp[lam > 0.5] = 0

# Create ideal Tikhonov filter
alpha = 0.99 / np.max(lam)
ideal_tk = np.ones((actors_agg_adj.shape[0],))
ideal_tk = 1 / (1 + alpha * lam)

In [5]:
# Create polynomial graph filter with given order
def graph_filter(order, ideal):
    coeff = fit_polynomial(lam, order, ideal)
    return polynomial_graph_filter(coeff, laplacian)

In [6]:
# Create polynomial graph filters
order = 3
g_f_hp = graph_filter(order, ideal_hp) # High-pass
g_f_lp = graph_filter(order, ideal_lp) # Low-pass
g_f_bp = graph_filter(order, ideal_bp) # Band-pass
g_f_tk = graph_filter(order, ideal_tk) # Tikhonov

In [7]:
def apply_filter_and_add_communities(graph_filter):
    # Get only continous features
    continuous_features = actors_agg_df.filter(["budget", "revenue", "vote_average", "popularity"]).to_numpy()
    # Apply filter
    x_filtered = graph_filter @ continuous_features
    x_filtered_df = pd.DataFrame(data=x_filtered, columns=["budget", "revenue", "vote_average", "popularity"])
    # Update dataframe with new filtered columns
    new_df = actors_agg_df.copy()
    new_df.update(x_filtered_df)
    # Add communities column
    communities_df = pd.DataFrame(data=actors_with_comunities_dataset[:,-1], columns=["community"])
    communities_df["community"] = communities_df["community"].apply(np.int)
    new_df = new_df.merge(communities_df, left_index=True, right_index=True)
    return new_df

In [34]:
actors_with_comunities_dataset = np.load("actors_with_communities_dataset.npy")
# Apply filter, get new dataframes
hp_new_df = apply_filter_and_add_communities(g_f_hp) # high-pass filter new dataframe
lp_new_df = apply_filter_and_add_communities(g_f_lp) # low-pass filter new dataframe
bp_new_df = apply_filter_and_add_communities(g_f_bp) # band-pass filter new dataframe
tk_new_df = apply_filter_and_add_communities(g_f_tk) # Tikhonov filter new dataframe

In [35]:
cols = ["budget", "revenue", "vote_average", "popularity","community"]

In [36]:
hp_new_df = hp_new_df[cols]
lp_new_df = lp_new_df[cols]
bp_new_df = bp_new_df[cols]
tk_new_df = tk_new_df[cols]

## Linear regression with filters

In [75]:
cols = ["budget", "revenue", "vote_average", "popularity"]

In [76]:
Xs_hp, Xs_com_hp, ys_hp, ys_com_hp = get_datasets(hp_new_df, cols)
Xs_lp, Xs_com_lp, ys_lp, ys_com_lp = get_datasets(lp_new_df, cols)
Xs_bp, Xs_com_bp, ys_bp, ys_com_bp = get_datasets(bp_new_df, cols)
Xs_tk, Xs_com_tk, ys_tk, ys_com_tk = get_datasets(tk_new_df, cols)

In [66]:
seed = 42

In [106]:
test_size = 0.2

## Highpass

In [107]:
# Worse than vanilla LR
y_budget_pred, budget_nmae, _ = get_linear_reg_results(Xs_hp["budget"],ys_hp["budget"],test_size,seed)
print("Normalized MAE budget: "+str(budget_nmae))

Normalized MAE budget: 0.06299409150343695


In [108]:
# Better than vanilla LR
y_revenue_pred, revenue_nmae, _ = get_linear_reg_results(Xs_hp["revenue"],ys_hp["revenue"],test_size,seed)
print("Normalized MAE revenue: "+str(revenue_nmae))

Normalized MAE revenue: 0.03596644272965434


In [109]:
# Worse than vanilla LR
y_popularity_pred, popularity_nmae, _ = get_linear_reg_results(Xs_hp["popularity"],ys_hp["popularity"],test_size,seed)
print("Normalized MAE popularity: "+str(popularity_nmae))

Normalized MAE popularity: 0.06469265096046435


In [110]:
# Better than vanilla LR
y_popularity_pred, popularity_nmae, _ = get_linear_reg_results(Xs_hp["vote_average"],ys_hp["vote_average"],test_size,seed)
print("Normalized MAE vote average: "+str(vote_average_nmae))

Normalized MAE vote average: 0.03345200714837459


## Highpass with communities

In [118]:
# Worse than vanilla LR
y_budget_com_pred, budget_com_nmae, _ = get_linear_reg_results(Xs_com_hp["budget"],ys_com_hp["budget"],test_size,seed)
print("Normalized MAE budget: "+str(budget_com_nmae))

Normalized MAE budget: 0.05829442265740332


In [119]:
# Better than vanilla LR
y_revenue_com_pred, revenue_com_nmae, _ = get_linear_reg_results(Xs_com_hp["revenue"],ys_com_hp["revenue"],test_size,seed)
print("Normalized MAE revenue: "+str(revenue_com_nmae))

Normalized MAE revenue: 0.035399450146715694


In [121]:
# Worse than vanilla LR
y_popularity_com_pred, popularity_com_nmae, _ = get_linear_reg_results(Xs_com_hp["popularity"],ys_com_hp["popularity"],test_size,seed)
print("Normalized MAE popularity: "+str(popularity_com_nmae))

Normalized MAE popularity: 0.06371574392347595


In [117]:
# Better than vanilla LR
y_vote_average_com_pred, vote_average_com_nmae, _ = get_linear_reg_results(Xs_com_hp["vote_average"],ys_com_hp["vote_average"],test_size,seed)
print("Normalized MAE vote average: "+str(vote_average_com_nmae))

Normalized MAE vote average: 0.06988900420380362


## Low pass

In [133]:
# Better than vanilla LR
y_budget_pred, budget_com_nmae, _ = get_linear_reg_results(Xs_lp["budget"],ys_lp["budget"],test_size,seed)
print("Normalized MAE budget: "+str(budget_com_nmae))

Normalized MAE budget: 0.03021694271060793


In [132]:
# Better than vanilla LR
y_revenue_pred, revenue_nmae, _ = get_linear_reg_results(Xs_lp["revenue"],ys_lp["revenue"],test_size,seed)
print("Normalized MAE revenue: "+str(revenue_nmae))

Normalized MAE revenue: 0.020584757899991157


In [131]:
# Worse than vanilla LR
y_popularity_pred, popularity_nmae, _ = get_linear_reg_results(Xs_lp["popularity"],ys_lp["popularity"],test_size,seed)
print("Normalized MAE popularity: "+str(popularity_nmae))

Normalized MAE popularity: 0.043734962807939794


In [130]:
# Better than vanilla LR
y_vote_average_pred, vote_average_nmae, _ = get_linear_reg_results(Xs_lp["vote_average"],ys_lp["vote_average"],test_size,seed)
print("Normalized MAE vote average: "+str(vote_average_nmae))

Normalized MAE vote average: 0.033396165578973146


## Low pass with communities

In [134]:
# Better than vanilla LR
y_budget_com_pred, budget_com_nmae, _ = get_linear_reg_results(Xs_com_lp["budget"],ys_com_lp["budget"],test_size,seed)
print("Normalized MAE budget: "+str(budget_com_nmae))

Normalized MAE budget: 0.029486408159055212


In [135]:
# Better than vanilla LR
y_revenue_com_pred, revenue_com_nmae, _ = get_linear_reg_results(Xs_com_lp["revenue"],ys_com_lp["revenue"],test_size,seed)
print("Normalized MAE revenue: "+str(revenue_com_nmae))

Normalized MAE revenue: 0.020615900141906673


In [136]:
# Worse than vanilla LR
y_popularity_com_pred, popularity_com_nmae, _ = get_linear_reg_results(Xs_com_lp["popularity"],ys_com_lp["popularity"],test_size,seed)
print("Normalized MAE popularity: "+str(popularity_com_nmae))

Normalized MAE popularity: 0.04300490754563347


In [138]:
# Better than vanilla LR
y_vote_average_com_pred, vote_average_com_nmae, _ = get_linear_reg_results(Xs_com_lp["vote_average"],ys_com_lp["vote_average"],test_size,seed)
print("Normalized MAE vote average: "+str(vote_average_com_nmae))

Normalized MAE vote average: 0.03345200714837459


## Band pass

In [139]:
# Better than vanilla LR
y_budget_pred, budget_nmae, _ = get_linear_reg_results(Xs_bp["budget"],ys_bp["budget"],test_size,seed)
print("Normalized MAE budget: "+str(budget_nmae))

Normalized MAE budget: 0.029864704534328285


In [143]:
# Better than vanilla LR
y_revenue_pred, revenue_nmae, _ = get_linear_reg_results(Xs_bp["revenue"],ys_bp["revenue"],test_size,seed)
print("Normalized MAE revenue: "+str(revenue_nmae))

Normalized MAE revenue: 0.019870952451698915


In [142]:
# Worse than vanilla LR
y_popularity_pred,popularity_nmae, _ = get_linear_reg_results(Xs_bp["popularity"],ys_bp["popularity"],test_size,seed)
print("Normalized MAE popularity: "+str(popularity_nmae))

Normalized MAE popularity: 0.04319556160629556


In [144]:
# Better than vanilla LR
y_vote_average_pred,vote_average_nmae, _ = get_linear_reg_results(Xs_bp["vote_average"],ys_bp["vote_average"],test_size,seed)
print("Normalized MAE vote_average: "+str(vote_average_nmae))

Normalized MAE vote_average: 0.0333420209335887


## Band pass with communities

In [145]:
# Better than vanilla LR
y_budget_com_pred, budget_com_nmae, _ = get_linear_reg_results(Xs_com_bp["budget"],ys_com_bp["budget"],test_size,seed)
print("Normalized MAE budget: "+str(budget_com_nmae))

Normalized MAE budget: 0.029080765866805997


In [147]:
# Better than vanilla LR
y_revenue_com_pred, revenue_com_nmae, _ = get_linear_reg_results(Xs_com_bp["revenue"],ys_com_bp["revenue"],test_size,seed)
print("Normalized MAE revenue: "+str(revenue_com_nmae))

Normalized MAE revenue: 0.019902773498509982


In [148]:
# Better than vanilla LR
y_popularity_com_pred, popularity_com_nmae, _ = get_linear_reg_results(Xs_com_bp["popularity"],ys_com_bp["popularity"],test_size,seed)
print("Normalized MAE popularity: "+str(popularity_com_nmae))

Normalized MAE popularity: 0.04249996925963499


In [149]:
# Better than vanilla LR
y_vote_average_com_pred, vote_average_com_nmae, _ = get_linear_reg_results(Xs_com_bp["vote_average"],ys_com_bp["vote_average"],test_size,seed)
print("Normalized MAE vote average: "+str(vote_average_com_nmae))

Normalized MAE vote average: 0.03340285581751198


## Tikhonov

In [150]:
# Better than vanilla LR
y_budget_pred, budget_nmae, _ = get_linear_reg_results(Xs_tk["budget"],ys_tk["budget"],test_size,seed)
print("Normalized MAE budget: "+str(budget_nmae))

Normalized MAE budget: 0.05046659069286355


In [151]:
# Better than vanilla LR
y_revenue_pred, revenue_nmae, _ = get_linear_reg_results(Xs_tk["revenue"],ys_tk["revenue"],test_size,seed)
print("Normalized MAE revenue: "+str(revenue_nmae))

Normalized MAE revenue: 0.028305018875898178


In [152]:
# Worse than vanilla LR
y_popularity_pred,popularity_nmae, _ = get_linear_reg_results(Xs_tk["popularity"],ys_tk["popularity"],test_size,seed)
print("Normalized MAE popularity: "+str(popularity_nmae))

Normalized MAE popularity: 0.06069344351326073


In [159]:
# Better than vanilla LR
y_vote_average_pred, vote_average_nmae, _ = get_linear_reg_results(Xs_tk["vote_average"],ys_tk["vote_average"],test_size,seed)
print("Normalized MAE vote average: "+str(vote_average_com_nmae))

Normalized MAE vote average: 0.051929251686079154


## Tikhonov with communities

In [154]:
# Better than vanilla LR
y_budget_com_pred, budget_com_nmae, _ = get_linear_reg_results(Xs_com_tk["budget"],ys_com_tk["budget"],test_size,seed)
print("Normalized MAE budget: "+str(budget_com_nmae))

Normalized MAE budget: 0.04649059228451098


In [155]:
# Better than vanilla LR
y_revenue_com_pred, revenue_com_nmae, _ = get_linear_reg_results(Xs_com_tk["revenue"],ys_com_tk["revenue"],test_size,seed)
print("Normalized MAE revenue: "+str(revenue_nmae_com))

Normalized MAE revenue: 0.028139101863799382


In [157]:
# Worse than vanilla LR
y_popularity_com_pred, popularity_com_nmae, _ = get_linear_reg_results(Xs_com_tk["popularity"],ys_com_tk["popularity"],test_size,seed)
print("Normalized MAE popularity: "+str(popularity_com_nmae))

Normalized MAE popularity: 0.05958096618106214


In [160]:
# Better than vanilla LR
y_vote_average_com_pred, vote_average_com_nmae, _ = get_linear_reg_results(Xs_com_tk["vote_average"],ys_com_tk["vote_average"],test_size,seed)
print("Normalized MAE vote average: "+str(vote_average_com_nmae))

Normalized MAE vote average: 0.051929251686079154
