In [None]:
import sys
import os

sys.path.append(os.path.abspath('..'))

import yaml
import matplotlib.pyplot as plt
import pandas as pd

from src.data.load import load_data
from src.data.prepare import prepare_data
from src.models.utils import leave_last_k

In [None]:
# read config
# safe_load uses the same restricted loader as yaml.load(..., Loader=yaml.SafeLoader)
with open('config.yml', mode='r') as config_file:
    config = yaml.safe_load(config_file)
# drop the (already closed) handle so it does not linger in the notebook namespace
del config_file

In [None]:
# load the tables described by the 'data_loader' section of the config ...
dfs = load_data(
    config=config['data_loader'],
)
# ... then run them through the preparation step; `dfs` is rebound to its result
dfs = prepare_data(
    dataframes=dfs,
    config=config["data_preparation"],
)

In [None]:
# train-test split
# leave_last_k returns the (train, test) pair for the prepared 'data' table
# NOTE(review): presumably holds out the last k interactions per user - confirm in src.models.utils
df_train, df_test = leave_last_k(
    df=dfs['data'],
    config=config['optimization'],
)

In [None]:
# distribution of the number of training ratings per user and per item
for col in ['user_id', 'item_id']:
    # 'user_id' -> 'User', 'item_id' -> 'Item'; used in the title and the y-axis label
    entity = col.split(sep="_")[0].capitalize()

    # one entry per distinct id: how many rows of df_train it has
    counts = df_train.groupby(by=[col]).size()

    # summary statistics, drawn below as vertical marker lines
    stats = {
        'Min': counts.min(),
        '1st Quartile': counts.quantile(0.25),
        'Median': counts.median(),
        'Mean': counts.mean(),
        '3rd Quartile': counts.quantile(0.75),
        'Max': counts.max(),
    }

    # histogram of the counts
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.hist(counts, bins=50, color='skyblue', edgecolor='black', alpha=0.7)

    # one dashed line per statistic; the value is shown in the legend entry
    for label, value in stats.items():
        ax.axvline(x=value, color='red', linestyle='--', label=f'{label}: {value:.2f}')

    # plot settings
    ax.set_title(f'Distribution of Ratings Per {entity}')
    ax.set_xlabel('Number of Ratings')
    ax.set_ylabel(f'Number of {entity}s')
    ax.legend(loc='upper right')
    ax.grid(False)
    plt.show()

# keep the notebook namespace free of loop temporaries
del col, entity, counts, stats, label, value, fig, ax

In [None]:
# ratings distribution
# Share of each rating value among all training ratings, scaled to percent so the
# plotted numbers agree with the 'Percentage of Ratings' axis label
# (value_counts(normalize=True) alone yields fractions between 0 and 1).
ratings_distribution = (
    df_train['rating']
    .value_counts(normalize=True)
    .mul(100)
    .sort_index()
)

# plot bar chart: one bar per rating value
fig, ax = plt.subplots(figsize=(8, 6))
bars = ax.bar(
    ratings_distribution.index,
    ratings_distribution.values,
    color='skyblue',
    edgecolor='black',
)

# plot settings
ax.set_title('Ratings Distribution')
ax.set_xlabel('Rating')
ax.set_ylabel('Percentage of Ratings')
ax.set_xticks(ratings_distribution.index)

# add percentage to each bar
for bar in bars:
    yval = bar.get_height()
    ax.text(
        bar.get_x() + bar.get_width() / 2, yval, f'{yval:.1f}%',
        ha='center', va='bottom', fontsize=10,
    )

plt.show()
del ratings_distribution, fig, ax, bars, bar, yval

In [None]:
# ratings distribution at the user or item level
for col in ['user_id', 'item_id']:
    # Row-normalised contingency table: one row per user/item, one column per
    # rating value; each cell is that rating's share (in %) of the row's ratings.
    # Rating values an id never used are filled with 0 by crosstab.
    shares = (
        pd.crosstab(index=df_train[col], columns=df_train['rating'], normalize='index')
        .mul(100)
        .round(1)
    )

    # Combined share of the two lowest (1, 2) and the two highest (4, 5) ratings.
    # reindex(..., fill_value=0) makes a rating value that never occurs in
    # df_train count as 0 instead of raising KeyError.
    low = shares.reindex(columns=[1, 2], fill_value=0).sum(axis=1)
    high = shares.reindex(columns=[4, 5], fill_value=0).sum(axis=1)
    shares = shares.assign(**{'1v2': low, '4v5': high})

    # The id stays in the index, so describe() summarises only the share columns
    # (as a regular column, a numeric id would be summarised as well).
    display(shares.describe().round(1))
del col, shares, low, high

In [None]:
# Week label ('%Y-%U': year plus week-of-year number) for every training rating.
# 'timestamp' is parsed as seconds since the epoch (unit='s').
# Built as a stand-alone grouping key, so df_train is neither copied nor modified.
year_week = (
    pd.to_datetime(df_train['timestamp'], unit='s')
    .dt.strftime('%Y-%U')
    .rename('year_week')
)

for col in ['user_id', 'item_id']:
    # 'user_id' -> 'User', 'item_id' -> 'Item'
    entity = col.split(sep="_")[0].capitalize()

    # number of distinct users/items with at least one rating in each week
    weekly = df_train.groupby(by=year_week)[col].nunique()

    # line plot of the weekly counts
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(weekly.index, weekly.values, marker='o', linestyle='-', color='b')
    ax.set_title(f'Number of Active {entity}s per Week')
    ax.set_xlabel('Year-Week')
    ax.set_ylabel(f'Active {entity}s')
    ax.tick_params(axis='x', labelrotation=90)
    fig.tight_layout()
    plt.show()
del col, entity, weekly, fig, ax, year_week

**Train/Test Split**

- Identify the share of items that appear in the test (or validation) set but not in the corresponding train set

In [None]:
def _print_item_coverage(title, n_items, n_total):
    """Print a short report: number of distinct items and their share of `n_total`.

    title   -- heading line of the report (e.g. "Train set")
    n_items -- number of distinct items in the subset being reported
    n_total -- number of distinct items in the full data set (the denominator)
    """
    print(
        f"{title}\n",
        f"- num. items: {n_items}\n",
        f"- pct. items: {round(n_items / n_total * 100, 1)} %\n"
    )


# every percentage below is relative to the distinct items of the full prepared data
n_items_total = dfs["data"]["item_id"].nunique()

# TRAIN/TEST SPLIT
_print_item_coverage("Train set", df_train["item_id"].nunique(), n_items_total)
_print_item_coverage("Test set", df_test["item_id"].nunique(), n_items_total)

# items which are in test but not in train set
new_items_t = set(df_test["item_id"].unique()).difference(df_train["item_id"].unique())
_print_item_coverage("New Items", len(new_items_t), n_items_total)
print("*" * 10, "\n")

# TRAIN/VALIDATION SPLIT
# NOTE: df_train is rebound to the smaller training part here. Re-running this
# cell on its own would split the already reduced frame again, so re-run the
# train-test split cell first.
df_train, df_valid = leave_last_k(df=df_train, config=config['optimization'])

_print_item_coverage("Train set", df_train["item_id"].nunique(), n_items_total)
_print_item_coverage("Validation set", df_valid["item_id"].nunique(), n_items_total)

# items which are in validation but not in train set
new_items_v = set(df_valid["item_id"].unique()).difference(df_train["item_id"].unique())
_print_item_coverage("New Items", len(new_items_v), n_items_total)

# new_items_t / new_items_v are kept on purpose: the next cell reads them
del n_items_total

In [None]:
# Number of ratings (in the full data set) per item that is missing from the train set.
# A cell only renders its last bare expression, so each summary is passed to
# display() explicitly; otherwise the validation summary would be computed and dropped.
for split_name, new_items in [("validation", new_items_v), ("test", new_items_t)]:
    print(f"Ratings per item present in the {split_name} set but not in the train set")
    display(
        dfs["data"][dfs["data"]["item_id"].isin(new_items)]
        .groupby(by=["item_id"]).size().describe()
    )
del split_name, new_items

**Negative Sampling**