In [19]:
import gc
import importlib
import os
import subprocess
import time
import urllib.request

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.model_selection as ms
import sklearn.preprocessing as pp
from scipy import sparse
from sklearn import metrics
from tqdm import tqdm_notebook as tqdm

from recommender import item_based, dummy as dummy_recommender, preprocessing as rpp, model_selection as rms,\
    tools as rtools, experiment_tools

import tag_genome_builder as tg_builder
from tag_genome_builder import model_selection as ms_tg_builder

from lib import tools
import config
import config_tag_recommender
def reload():
    """Re-import every project module this notebook uses.

    Lets edits made to the project sources while the kernel is running take
    effect without restarting it. Modules are reloaded in the order listed.
    `experiment_tools` is included because the prediction cell calls
    `reload()` immediately before using it.
    """
    project_modules = (
        item_based,
        dummy_recommender,
        config,
        tg_builder,
        rpp,
        rms,
        tools,
        config_tag_recommender,
        ms_tg_builder,
        rtools,
        experiment_tools,
    )
    for module in project_modules:
        importlib.reload(module)
    
    

# Input parameters

In [20]:
# Re-import the project modules first so the values read below come from the
# current version of config_tag_recommender.
reload()
# Copy the input file locations and thresholds from config_tag_recommender into
# notebook-level names used by the cells that follow.
str_aggregated_path = config_tag_recommender.str_aggregated_path
str_genome_scores = config_tag_recommender.str_genome_scores
str_tags = config_tag_recommender.str_tags
str_tag_ids = config_tag_recommender.str_tag_ids
str_rating_path = config_tag_recommender.str_rating_path
str_data_folder = config_tag_recommender.str_data_folder
minimum_no_of_frames = config_tag_recommender.minimum_no_of_frames
number_of_tag_per_movie = config_tag_recommender.number_of_tag_per_movie
# Worker count handed to the cross-validated tag-genome computation (n_jobs=n_jobs).
n_jobs = 8

In [21]:
# Echo the effective inputs so the saved notebook records what this run used.
# Each parameter is reported exactly once, including the locally set n_jobs.
print('str_aggregated_path =', str_aggregated_path)
print('str_genome_scores =', str_genome_scores)
print('str_tags =', str_tags)
print('str_tag_ids =', str_tag_ids)
print('str_rating_path =', str_rating_path)
print('str_data_folder =', str_data_folder)
print('minimum_no_of_frames =', minimum_no_of_frames)
print('number_of_tag_per_movie =', number_of_tag_per_movie)
print('n_jobs =', n_jobs)

str_aggregated_path = C:\Users\shossein\Dropbox\Original Data\Elahi LowLevel Features\Mise-en-Scene Dataset_v1\LLVisualFeatures13K_Log.csv
str_genome_scores = ..\..\data\ml-25m\genome-scores.csv
str_tags = ..\..\data\ml-25m\tags.csv
str_tag_ids = ..\..\data\ml-25m\genome-tags.csv
str_rating_path = ..\..\data\ml-25m\ratings.csv
str_data_folder = ..\..\data
str_aggregated_path = C:\Users\shossein\Dropbox\Original Data\Elahi LowLevel Features\Mise-en-Scene Dataset_v1\LLVisualFeatures13K_Log.csv
minimum_no_of_frames = 10
number_of_tag_per_movie = 10


# Read inputs

In [22]:
# Tag id table, indexed by the tag id column named in config.
df_tag_ids = pd.read_csv(
    str_tag_ids,
    index_col=config.tagId_col,
)

In [23]:
# df_tag_ids.head()

In [24]:
# Tag-genome scores file, loaded in full with the default integer index.
df_genome = pd.read_csv(str_genome_scores)

In [25]:
# Only the first 100,000 rating rows are read, not the whole ratings file.
df_ratings = pd.read_csv(
    str_rating_path,
    nrows=100_000,
)

In [26]:
# usecols = ['movieId', 'no_key_frames']
# for i in range(1,11):
#     usecols += [f'f{i}_median', f'f{i}_quartile1', f'f{i}_quartile3', f'f{i}_std'] 
# print(len(usecols))
# None means "read every column" of the aggregated visual-feature file.
usecols = None

# Visual features indexed by movie id and sorted on that index.
df_agg = (
    pd.read_csv(str_aggregated_path, nrows=None, usecols=usecols, index_col=config.movieId_col)
    .sort_index()
)
# df_agg = df_agg[df_agg['no_key_frames'] >= minimum_no_of_frames]
# Keep only the columns that have at least len(df_agg) - 1000 non-missing values.
df_agg = df_agg.dropna(axis=1, thresh=len(df_agg) - 1000)

In [27]:
# Report dataset sizes, restrict both datasets via filter_tag_and_vf_to_same
# (presumably to the movies they share - confirm in tag_genome_builder), then report again.
print('The size of datasets before filtering to the same:')
print(f'VF: {df_agg.shape} Tag genome: {df_genome[config.movieId_col].nunique()}')
df_agg, df_genome = tg_builder.Base().filter_tag_and_vf_to_same(
    df_agg=df_agg,
    df_genome_scores=df_genome,
)
print('The size of datasets after filtering to the same:')
print(f'VF: {df_agg.shape} Tag genome: {df_genome[config.movieId_col].nunique()}')

The size of datasets before filtering to the same:
VF: (13373, 7) Tag genome: 13816
The size of datasets after filtering to the same:
VF: (6359, 7) Tag genome: 6359


In [28]:
# Wrap the filtered visual features in the recommender's ItemFeature container.
vf_item_features = rpp.ItemFeature()
vf_item_features.from_dataframe(df_agg)

In [29]:
# Item features derived from the filtered tag-genome scores.
# NOTE(review): presumably keeps `number_of_tag_per_movie` tags per movie - confirm in recommender.preprocessing.
item_features = rpp.get_item_feature_from_tag_genome(df_genome, number_of_tag_per_movie)

In [30]:
# Keep ratings only for movies that have item features.
df_ratings_filtered = df_ratings.loc[df_ratings[config.movieId_col].isin(item_features.item_ids)]
# Number of remaining ratings per user.
user_activities = df_ratings_filtered[config.userId_col].value_counts()
# Drop users left with a single rating.
df_ratings_filtered = df_ratings_filtered.loc[
    df_ratings_filtered[config.userId_col].isin(user_activities.loc[user_activities > 1].index)
]

# Compute tag genome based on visual features

In [31]:
# Transformer passed to cross_val_predict as `normalizer_vf`; configured for a
# normal output distribution. The StandardScaler line is a kept alternative.
normalizer = pp.QuantileTransformer(output_distribution='normal')
# normalizer = pp.StandardScaler()


In [32]:
%%time
reload()
df_predicted_tag_genome = ms_tg_builder.cross_val_predict(df_visual_features=df_agg,
                                                          df_genome_scores=df_genome,
                                                          normalizer_vf=normalizer,
                                                          n_splits=10, 
                                                          n_jobs=n_jobs)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Wall time: 20.1 s


In [33]:
# Same item-feature construction as for the real tag genome, applied to the
# tag genome predicted from visual features.
item_features_vf_tg = rpp.get_item_feature_from_tag_genome(df_predicted_tag_genome, number_of_tag_per_movie)

# Train/test split
## Rating dataset and tag item features

In [34]:
reload()
# Hold out 25% of the filtered ratings as the test set ('sklearn' strategy, no item features).
df_rating_train, df_rating_test = rms.train_test_split(
    df_ratings_filtered,
    item_features=None,
    strategy='sklearn',
    test_size=.25,
)

In [35]:
vf_normalizer = tg_builder.VisualFeatureNormalizer()
# The scaler is passed inline instead of being bound to `normalizer`: that name
# already holds the QuantileTransformer used by the cross-validated tag-genome
# cell, and rebinding it here would silently change that cell if it were re-run.
df_agg_train = df_agg.loc[df_rating_train[config.movieId_col].unique()]
df_agg_train_normalized = vf_normalizer.fit_transform(df_agg_train, pp.StandardScaler())
# Test movies only go through transform(), so nothing is re-fitted on them.
df_agg_test = df_agg.loc[df_rating_test[config.movieId_col].unique()]
df_agg_test_normalized = vf_normalizer.transform(df_agg_test)

# Wrap both normalised frames in ItemFeature containers.
item_features_vf_train = rpp.ItemFeature()
item_features_vf_train.from_dataframe(df_agg_train_normalized)
item_features_vf_test = rpp.ItemFeature()
item_features_vf_test.from_dataframe(df_agg_test_normalized)


## Tag genome computed from visual features

In [36]:
reload()
# Restrict the predicted-genome item features to the movies rated in each split.
item_features_vf_tg_train = item_features_vf_tg.get_item_feature_by_list_of_items(
    df_rating_train[config.movieId_col].unique()
)
item_features_vf_tg_test = item_features_vf_tg.get_item_feature_by_list_of_items(
    df_rating_test[config.movieId_col].unique()
)

# Computing ratings on the test data

# Train the content-based recommender on the VF tag genome and predict on the test set

In [39]:
reload()
# Produce test-set predictions for 1..10 tags per movie from the predicted tag genome.
# Parallelism follows the notebook-level `n_jobs` setting rather than a separate
# hard-coded worker count, so one knob controls every parallel step.
# NOTE(review): the result replaces df_rating_test; presumably it is the test frame
# with added prediction columns (later cells filter on 'rating_predicted') - confirm.
df_rating_test = experiment_tools.get_predictions_for_different_number_of_tags(
    df_predicted_tag_genome,
    np.arange(1, 11),
    df_rating_train,
    df_rating_test,
    n_jobs=n_jobs,
)


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




In [None]:
# Preview the first rows only instead of rendering the whole test frame.
df_rating_test.head()

In [40]:
# for number_of_tag_per_movie in tqdm(range(1, 11), total=10):
#     prediction_column_suffix=f'vf_tg_{number_of_tag_per_movie}'
#     item_features_vf_tg = rpp.get_item_feature_from_tag_genome(df_predicted_tag_genome, number_of_tag_per_movie)
#     item_features_train =\
#         item_features_vf_tg.get_item_feature_by_list_of_items(df_rating_train[config.movieId_col].unique())
#     item_features_test =\
#         item_features_vf_tg.get_item_feature_by_list_of_items(df_rating_test[config.movieId_col].unique())
#     recommend = item_based.ItemBasedColabCos()
#     recommend.fit(df_rating_train, item_features_train)
#     test_users = df_rating_test[config.userId_col].unique()
#     recommendations = recommend.predict_on_list_of_users(test_users, df_rating_test, item_features_test, n_jobs=10)
#     df_rating_test = rtools.prepare_recommendations_df(df_rating_test=df_rating_test,
#                                                        recommendations=recommendations, 
#                                                        prediction_column_suffix=prediction_column_suffix)
#     print(prediction_column_suffix)
#     display(tools.performance_report(df_rating_test, prediction_column_suffix=prediction_column_suffix))
# #     df_rating_test.to_csv(f'df_rating_test_tg_vf{number_of_tag_per_movie}.csv')
# # rate_normalizer = rpp.RatingNormalizer()
# # df_rating_train[config.rating_col] = rate_normalizer.fit_transform(df_rating_train)
# # df_rating_test[config.rating_col] = rate_normalizer.transform(df_rating_test)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

vf_tg_1


Unnamed: 0,Average Score,MAE,RMSE,NRMSE,R2,Std of residuals,Avg of residuals,Coverage
vf_tg_1,3.565244,0.776589,1.013876,0.284378,0.060825,1.013912,0.000527,0.94543


vf_tg_2


Unnamed: 0,Average Score,MAE,RMSE,NRMSE,R2,Std of residuals,Avg of residuals,Coverage
vf_tg_2,3.574615,0.752768,0.974761,0.27269,0.131786,0.974794,-0.001501,0.987423


vf_tg_3


Unnamed: 0,Average Score,MAE,RMSE,NRMSE,R2,Std of residuals,Avg of residuals,Coverage
vf_tg_3,3.575363,0.747819,0.966361,0.270283,0.147295,0.966347,-0.009655,0.994502


vf_tg_4


Unnamed: 0,Average Score,MAE,RMSE,NRMSE,R2,Std of residuals,Avg of residuals,Coverage
vf_tg_4,3.576143,0.746159,0.964354,0.269663,0.150412,0.964353,-0.008165,0.996495


vf_tg_5


Unnamed: 0,Average Score,MAE,RMSE,NRMSE,R2,Std of residuals,Avg of residuals,Coverage
vf_tg_5,3.575913,0.744763,0.961348,0.26884,0.155734,0.961356,-0.006966,0.997251


vf_tg_6


Unnamed: 0,Average Score,MAE,RMSE,NRMSE,R2,Std of residuals,Avg of residuals,Coverage
vf_tg_6,3.575952,0.744125,0.960576,0.268621,0.156694,0.960589,-0.006063,0.998557


vf_tg_7


Unnamed: 0,Average Score,MAE,RMSE,NRMSE,R2,Std of residuals,Avg of residuals,Coverage
vf_tg_7,3.575907,0.743414,0.95949,0.268321,0.15866,0.959502,-0.006295,0.998694


vf_tg_8


Unnamed: 0,Average Score,MAE,RMSE,NRMSE,R2,Std of residuals,Avg of residuals,Coverage
vf_tg_8,3.575925,0.742774,0.958691,0.268096,0.16003,0.958709,-0.005278,0.9989


vf_tg_9


Unnamed: 0,Average Score,MAE,RMSE,NRMSE,R2,Std of residuals,Avg of residuals,Coverage
vf_tg_9,3.576076,0.742717,0.958202,0.267948,0.160781,0.958221,-0.005262,0.999175


vf_tg_10


Unnamed: 0,Average Score,MAE,RMSE,NRMSE,R2,Std of residuals,Avg of residuals,Coverage
vf_tg_10,3.576198,0.742603,0.958035,0.267892,0.16107,0.958045,-0.006575,0.999381





In [None]:
# NOTE(review): `wait` is not defined anywhere in this notebook, so this cell raises
# NameError and stops a "Run All" here - presumably an intentional halt before the
# reporting cells; confirm, and replace with an explicit stop if so.
wait

# Looking at the performance

In [None]:
# Histogram of every prediction column; the trailing semicolon hides the array-of-axes repr.
df_rating_test.filter(regex='rating_predicted').hist();

In [None]:
# Derive the algorithm suffix of each prediction column: strip the
# 'rating_predicted' stem, then the one separator character that follows it.
prediction_column_suffixes = []
for c in df_rating_test.filter(regex='rating_predicted').columns:
    # Slicing an empty string yields '', so a bare stem maps to '' without a length check.
    c = c.replace('rating_predicted', '')[1:]
    prediction_column_suffixes.append(c)
prediction_column_suffixes

In [None]:
reload()
# One performance report per prediction column, stacked and ordered by RMSE (ascending).
df_performance_report = pd.concat(
    [
        tools.performance_report(df_rating_test, prediction_column_suffix=alg)
        for alg in prediction_column_suffixes
    ]
).sort_values('RMSE')
df_performance_report

## Performance table for copying into LaTeX

In [None]:
# print() rather than rich display, so the LaTeX source comes out as plain text to copy.
print(df_performance_report.to_latex())

## Some plots

In [None]:
# Histogram of the test-set column named by config.rating_col (presumably the actual ratings).
df_rating_test[config.rating_col].hist()

In [None]:
reload()
# Prediction histograms, one panel per prediction column suffix.
_ = tools.plot_side_by_side(
    tools.plot_prediction_histogram,
    df_rating_pred=df_rating_test,
    prediction_column_suffixes=prediction_column_suffixes,
)

In [None]:
reload()
# Actual vs. predicted ratings per suffix; alpha=.1 is forwarded to the plotting function.
_ = tools.plot_side_by_side(
    tools.plot_actual_vs_prediction,
    df_rating_pred=df_rating_test,
    prediction_column_suffixes=prediction_column_suffixes,
    alpha=.1,
)

In [None]:
reload()
# Box plots of actual vs. predicted ratings, one panel per suffix.
_ = tools.plot_side_by_side(
    tools.plot_actual_vs_predicted_boxplot,
    df_rating_pred=df_rating_test,
    prediction_column_suffixes=prediction_column_suffixes,
)

In [None]:
reload()
# Residual box plots, one panel per suffix.
_ = tools.plot_side_by_side(
    tools.plot_residual_boxplot,
    df_rating_pred=df_rating_test,
    prediction_column_suffixes=prediction_column_suffixes,
)

In [None]:
reload()
# Absolute-residual box plots, one panel per suffix.
_ = tools.plot_side_by_side(
    tools.plot_absolute_residual_boxplot,
    df_rating_pred=df_rating_test,
    prediction_column_suffixes=prediction_column_suffixes,
)

In [None]:
# reload()
# _ = tools.plot_side_by_side(tools.plot_actual_vs_predicted_violinplot,
#                             df_rating_pred=df_rating_test,
#                             prediction_column_suffixes=prediction_column_suffixes)

# Save the current notebook into the results folder and push it to the repository

In [None]:
%%javascript
// Ask the notebook front end to save this notebook before it is snapshotted below.
// NOTE(review): relies on the classic Notebook `IPython.notebook` object - confirm it exists in the front end in use.
IPython.notebook.save_notebook()

In [None]:
from notebook import notebookapp
import urllib
import json
import os
import ipykernel

def currentTime():
    """Return the current local time formatted as 'YYYY-MM-DD-HHMM'."""
    return time.strftime('%Y-%m-%d-%H%M')
def notebook_path():
    """Returns the absolute path of the Notebook or None if it cannot be determined
    NOTE: works only when the security is token-based or there is also no password

    The kernel id is taken from the kernel connection file name (the text between
    the first '-' and the first '.'). Each running notebook server is then asked
    for its sessions, and the session whose kernel id matches gives the path.
    """
    connection_file = os.path.basename(ipykernel.get_connection_file())
    kernel_id = connection_file.split('-', 1)[1].split('.')[0]

    for srv in notebookapp.list_running_servers():
        try:
            url = srv['url'] + 'api/sessions'
            # The token is appended unless the server has neither token nor password.
            if srv['token'] != '' or srv['password']:
                url += '?token=' + srv['token']
            # Context manager so the HTTP response is closed after it is parsed.
            with urllib.request.urlopen(url) as req:
                sessions = json.load(req)
            for sess in sessions:
                if sess['kernel']['id'] == kernel_id:
                    return os.path.join(srv['notebook_dir'], sess['notebook']['path'])
        except Exception:
            # Best effort: there may be stale entries in the runtime directory, so an
            # unreachable or malformed server is skipped. KeyboardInterrupt and
            # SystemExit are deliberately not caught.
            continue
    return None

# Resolve the notebook location once; notebook_path() returns None when no running
# server reports a session for this kernel.
current_notebook = notebook_path()
if current_notebook is None:
    raise RuntimeError('Could not determine the path of the running notebook')
notebook_dir, notebook_name = os.path.split(current_notebook)
# One timestamp shared by the snapshot file name and the commit message.
timestamp = currentTime()
output = os.path.join(notebook_dir, 'output', f'{notebook_name}_{timestamp}.ipynb')
# Argument lists (no shell) so paths containing spaces reach the tools intact.
# As with the os.system calls these replace, a non-zero exit status is not fatal.
subprocess.run(['jupyter', 'nbconvert', '--to', 'ipynb', current_notebook, '--output', output])
subprocess.run(['git', 'add', output])
subprocess.run(['git', 'commit', '-m', f'result computed on {timestamp}'])
# The exit status of the push is the cell's displayed value.
subprocess.run(['git', 'push']).returncode

In [None]:
# Store the test ratings with their predictions next to the notebook snapshot,
# under output/<notebook name>_<timestamp>_df_rating_test.csv.
df_rating_test.to_csv(
    os.path.join(
        notebook_dir,
        'output',
        f'{notebook_name}_{currentTime()}_df_rating_test.csv',
    )
)