# Data exploration 2.0

[MovieLens 100K dataset](https://grouplens.org/datasets/movielens/100k/)


In [1]:
import contextlib
import os
from typing import Literal

import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
# Default seed handed to np.random.seed() by generate_data_frames, so the
# random masking is reproducible between runs.
MANUAL_SEED: int = 42

## Data loading


In [3]:
def load_occupations(path: str) -> dict[str, int]:
    """Map every occupation name listed in ``u.occupation`` to an integer id.

    Args:
        path: Directory that contains the ``u.occupation`` file.

    Returns:
        A dict from occupation name to its zero-based position among the
        non-blank lines of the file.
    """
    # Explicit encoding, matching the one the pandas-based loaders in this
    # file pass to read_csv, so the result does not depend on the locale.
    with open(os.path.join(path, "u.occupation"), encoding="ISO-8859-1") as f:
        names = [line.strip() for line in f]

    # Skip blank lines (typically a trailing one); otherwise the empty string
    # would be registered as an occupation of its own.
    return {name: idx for idx, name in enumerate(n for n in names if n)}


def load_genres(path: str) -> list[str]:
    """Return the genre names stored in ``u.genre``, in file order.

    Args:
        path: Directory that contains the ``u.genre`` file.

    Returns:
        The first ("name") field of every ``name|genre_idx`` record.
    """
    genre_file = os.path.join(path, "u.genre")
    read_options = {
        "sep": "|",
        "header": None,
        "names": ["name", "genre_idx"],
        "encoding": "ISO-8859-1",
    }
    genre_table = pd.read_csv(genre_file, **read_options)
    return genre_table.loc[:, "name"].tolist()


def load_items(path: str, genres: list[str]) -> pd.DataFrame:
    """Read the pipe-separated ``u.item`` movie table.

    Args:
        path: Directory that contains the ``u.item`` file.
        genres: Genre names; they become the names of the columns that follow
            the five descriptive columns.

    Returns:
        A DataFrame with the descriptive columns followed by one column per
        entry of ``genres``.
    """
    descriptive_columns = [
        "movie_id",
        "movie_title",
        "release_date",
        "video_release_date",
        "IMDb_URL",
    ]
    column_names = descriptive_columns + list(genres)
    item_file = os.path.join(path, "u.item")
    return pd.read_csv(
        item_file, sep="|", header=None, names=column_names, encoding="ISO-8859-1"
    )


def load_users(path: str) -> pd.DataFrame:
    """Read the pipe-separated ``u.user`` table.

    Args:
        path: Directory that contains the ``u.user`` file.

    Returns:
        A DataFrame with the columns ``user_id``, ``age``, ``gender``,
        ``occupation`` and ``zip_code``.
    """
    user_columns = ["user_id", "age", "gender", "occupation", "zip_code"]
    user_file = os.path.join(path, "u.user")
    users_table = pd.read_csv(
        user_file, sep="|", header=None, names=user_columns, encoding="ISO-8859-1"
    )
    return users_table


def load_connections(path: str) -> pd.DataFrame:
    """Read the tab-separated ``u.data`` rating records.

    Args:
        path: Directory that contains the ``u.data`` file.

    Returns:
        A DataFrame with the columns ``user_id``, ``item_id``, ``rating`` and
        ``timestamp``.
    """
    rating_columns = ["user_id", "item_id", "rating", "timestamp"]
    data_file = os.path.join(path, "u.data")
    connections = pd.read_csv(
        data_file, sep="\t", names=rating_columns, engine="python"
    )
    return connections


# Load every raw table from the same ../data/raw/ml-100k/ directory.
genres = load_genres("../data/raw/ml-100k/")
occupations_dict = load_occupations("../data/raw/ml-100k/")
raw_connections_df = load_connections("../data/raw/ml-100k/")
# The genre names are passed in because load_items uses them as column names.
raw_items_df = load_items("../data/raw/ml-100k/", genres)
raw_users_df = load_users("../data/raw/ml-100k/")


# Dataset sizes, taken as the row counts of the users and items tables.
num_users = len(raw_users_df)
num_items = len(raw_items_df)
print(f"{num_users=}\n{num_items=}")
raw_connections_df.head()

num_users=943
num_items=1682


Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


## Data preprocessing

In [4]:
# Work on a copy so the raw users table itself is not modified.
users_df = raw_users_df.copy()

In [5]:
def basic_users_encode(df: pd.DataFrame) -> pd.DataFrame:
    users_df = df.copy()

    users_df["gender"] = (raw_users_df["gender"] == "M").astype(int)
    users_df["age"] = raw_users_df["age"] / 100

    users_df.drop(columns=["zip_code"], inplace=True)

    users_df["occupation"] = raw_users_df["occupation"].apply(occupations_dict.get)

    return users_df

In [6]:
# Encode the raw users table; this rebinds the users_df name to the result.
users_df = basic_users_encode(raw_users_df)
users_df.head()

Unnamed: 0,user_id,age,gender,occupation
0,1,0.24,1,19
1,2,0.53,0,13
2,3,0.23,1,20
3,4,0.24,1,19
4,5,0.33,0,13


In [7]:
def calculate_ratings(
    user_id: int, items_length: int, connections_df: pd.DataFrame
) -> list[float]:
    """Build the rating vector of one user.

    Args:
        user_id: Id of the user whose ratings are collected.
        items_length: Length of the returned vector.
        connections_df: Rating records with the columns ``user_id``,
            ``item_id`` (1-based) and ``rating``.

    Returns:
        A list of ``items_length`` floats in which position ``item_id - 1``
        holds the user's rating of that item and unrated items stay ``0.0``.
        If an item appears more than once, the last record wins.

    Raises:
        IndexError: If an ``item_id`` does not fit into ``items_length``.
    """
    ratings = [0.0] * items_length

    own_rows = connections_df.loc[
        connections_df["user_id"] == user_id, ["item_id", "rating"]
    ]
    # Read the two columns directly instead of iterrows(): a row Series is
    # upcast to float when the frame has mixed dtypes, which would make the
    # item id unusable as a list index.
    for item_id, rating in zip(
        own_rows["item_id"].tolist(), own_rows["rating"].tolist()
    ):
        ratings[int(item_id) - 1] = float(rating)
    return ratings


def get_ratings_dict(
    df: pd.DataFrame, connections_df: pd.DataFrame, items_df: pd.DataFrame
) -> dict[int, list[float]]:
    """Collect the rating vector of every user listed in ``df``.

    Args:
        df: User table; its ``user_id`` column selects the users.
        connections_df: Rating records forwarded to ``calculate_ratings``.
        items_df: Item table; the length of its ``movie_id`` column fixes the
            length of every rating vector.

    Returns:
        A dict from user id to that user's rating vector.
    """
    vector_length = len(items_df["movie_id"])
    return {
        uid: calculate_ratings(uid, vector_length, connections_df)
        for uid in df["user_id"].tolist()
    }

In [8]:
# One rating vector per user id, with one slot per row of raw_items_df.
ratings_dict = get_ratings_dict(users_df, raw_connections_df, raw_items_df)

In [17]:
# Number of genre names loaded from u.genre; calculate_genre_ratios uses it
# as the length of its result vector.
NUM_GENRES = len(genres)
NUM_GENRES

19

In [24]:
def calculate_genre_ratios(
    movie_indices: np.ndarray,
    movie_ratings: np.ndarray,
    items_df: pd.DataFrame,
    max_rating: float = 5.0,
) -> np.ndarray:
    """Compute a rating-weighted genre profile for a set of movies.

    For every genre the ratings of the selected movies belonging to it are
    summed and divided by the best possible total, i.e. every selected movie
    rated ``max_rating``.

    Args:
        movie_indices: Zero-based movie positions; ``index + 1`` is matched
            against ``items_df["movie_id"]``.
        movie_ratings: Rating of each selected movie, aligned with
            ``movie_indices``.
        items_df: Item table whose columns from position 5 onwards are the
            genre indicator columns.
        max_rating: Highest possible rating, used for normalisation.

    Returns:
        A float array with one entry per genre column. An empty selection
        yields all zeros instead of the NaNs produced by a 0/0 division.

    Raises:
        ValueError: If ``movie_indices`` and ``movie_ratings`` differ in length.
        IndexError: If a requested movie id is absent from ``items_df``.
    """
    movie_ids = np.asarray(movie_indices) + 1
    ratings = np.asarray(movie_ratings, dtype=float)
    if len(movie_ids) != len(ratings):
        raise ValueError("movie_indices and movie_ratings must have the same length")

    # The number of genres comes from the table itself, so the function does
    # not depend on a module-level constant.
    num_genres = items_df.iloc[:, 5:].shape[1]
    if len(movie_ids) == 0:
        return np.zeros(num_genres)

    # Keep the first row of every movie id, then resolve all requested ids in
    # one lookup instead of filtering the whole frame once per movie.
    first_occurrence = ~items_df["movie_id"].duplicated().to_numpy()
    catalogue = items_df.loc[first_occurrence]
    positions = pd.Index(catalogue["movie_id"]).get_indexer(movie_ids)
    if (positions < 0).any():
        missing = movie_ids[positions < 0].tolist()
        raise IndexError(f"movie ids not present in items_df: {missing}")

    genre_matrix = catalogue.iloc[:, 5:].to_numpy(dtype=float)
    weighted = genre_matrix[positions] * ratings[:, np.newaxis]

    # Best scenario: every selected movie has the maximum rating.
    return weighted.sum(axis=0) / (len(movie_ids) * max_rating)


def update_user_dict(
    row: pd.Series,
    user_dict: dict,
    items_df: pd.DataFrame,
    ratings_dict: dict[int, list[float]],
    mask_percent: float,
) -> dict:
    """Append one randomly masked sample for a user to ``user_dict``.

    A random ``mask_percent`` share of the user's non-zero ratings is zeroed
    in the ``input`` vector; the genre profile is computed from the ratings
    that stay visible.

    Args:
        row: User record; ``row["user_id"]`` selects the rating vector.
        user_dict: Column-name -> list-of-values accumulator. It is extended
            in place and also returned.
        items_df: Item table forwarded to ``calculate_genre_ratios``.
        ratings_dict: Rating vector of every user, keyed by user id.
        mask_percent: Fraction of the non-zero ratings to hide.

    Returns:
        ``user_dict`` with the row's fields plus ``input``, ``output`` and
        ``genres`` appended.
    """
    full_ratings = np.array(ratings_dict[row["user_id"]])
    rated_positions = full_ratings.nonzero()[0]
    # In-place shuffle driven by the global NumPy RNG.
    np.random.shuffle(rated_positions)

    hidden_count = int(mask_percent * len(rated_positions))
    hidden_positions = rated_positions[:hidden_count]
    visible_positions = rated_positions[hidden_count:]
    visible_ratings = full_ratings[visible_positions]

    masked_ratings = full_ratings.copy()
    masked_ratings[hidden_positions] = 0.0

    genre_profile = calculate_genre_ratios(
        visible_positions, visible_ratings, items_df
    )

    record = {
        **row.to_dict(),
        "input": masked_ratings.tolist(),
        "output": full_ratings.tolist(),
        "genres": genre_profile.tolist(),
    }
    for column, value in record.items():
        user_dict.setdefault(column, []).append(value)

    return user_dict


def generate_data_frames(
    users_df: pd.DataFrame,
    items_df: pd.DataFrame,
    ratings_dict: dict[int, list[float]],
    split_type: Literal["users"] | Literal["masks"] = "masks",
    mask_percents: list[float] | None = None,
    num_masks: int = 4,
    seed: int = MANUAL_SEED,
    label: str = "",
) -> list[pd.DataFrame]:
    """Generate randomly masked training samples grouped into data frames.

    Args:
        users_df: Encoded user table; every row yields
            ``num_masks * len(mask_percents)`` samples.
        items_df: Item table forwarded to ``update_user_dict``.
        ratings_dict: Rating vector of every user, keyed by user id.
        split_type: ``"masks"`` builds one frame per (repetition, mask
            percent) pair holding all users; any other value builds one frame
            per user holding all of that user's maskings.
        mask_percents: Fractions of the rated items to hide. Defaults to
            ``[0.2, 0.4, 0.6, 0.8, 0.9]``.
        num_masks: Number of repetitions of every mask percent.
        seed: Seed for the global NumPy RNG, set once at the start.
        label: Description shown on the progress bar.

    Returns:
        The generated data frames, in generation order.
    """
    # None stands in for the default list so that no mutable object is shared
    # between calls.
    if mask_percents is None:
        mask_percents = [0.2, 0.4, 0.6, 0.8, 0.9]

    np.random.seed(seed)

    data_frames = []
    total_samples = num_masks * len(mask_percents) * len(users_df)

    # The context manager closes the progress bar even if a sample fails.
    with tqdm(total=total_samples, desc=label) as loop:
        if split_type == "masks":
            for _ in range(num_masks):
                for mask_percent in mask_percents:
                    user_dict = {}
                    for _, row in users_df.iterrows():
                        user_dict = update_user_dict(
                            row, user_dict, items_df, ratings_dict, mask_percent
                        )
                        loop.update(1)
                    data_frames.append(pd.DataFrame.from_dict(user_dict))
        else:
            for _, row in users_df.iterrows():
                user_dict = {}
                for _ in range(num_masks):
                    for mask_percent in mask_percents:
                        user_dict = update_user_dict(
                            row, user_dict, items_df, ratings_dict, mask_percent
                        )
                        loop.update(1)
                data_frames.append(pd.DataFrame.from_dict(user_dict))

    return data_frames

In [25]:
# "masks" split: one frame per (repetition, mask percent) pair, each holding
# every user.
data_frames_masks_split = generate_data_frames(
    users_df, raw_items_df, ratings_dict, split_type="masks", label="Masks split"
)
# "users" split: one frame per user, holding all of that user's maskings.
data_frames_users_split = generate_data_frames(
    users_df, raw_items_df, ratings_dict, split_type="users", label="Users split"
)

Masks split:   0%|          | 0/18860 [00:00<?, ?it/s]

Masks split: 100%|██████████| 18860/18860 [05:54<00:00, 53.13it/s] 
Users split: 100%|██████████| 18860/18860 [05:20<00:00, 58.82it/s] 


## Save data

In [26]:
def save_mkdir(path: str) -> None:
    """Create ``path`` and any missing parent directories.

    An already existing directory is not an error. Real failures, such as
    missing permissions or a path component that is a regular file, are raised
    here instead of being swallowed and surfacing later as an unrelated write
    error.

    Args:
        path: Directory to create.

    Raises:
        OSError: If the directory cannot be created.
    """
    os.makedirs(path, exist_ok=True)


def save_on_disk(data_frames: list[pd.DataFrame], path: str, label: str = ""):
    """Write every data frame to ``<path>/<position>.csv`` without the index.

    Args:
        data_frames: Frames to store; the list position becomes the file name.
        path: Target directory, created through ``save_mkdir`` first.
        label: Description shown on the progress bar.
    """
    save_mkdir(path)
    progress = tqdm(enumerate(data_frames), total=len(data_frames), desc=label)
    for position, frame in progress:
        target_file = os.path.join(path, f"{position}.csv")
        frame.to_csv(target_file, index=False)


def save_data(
    data_frames: list[pd.DataFrame], path: str, train_ratio: float, label: str = ""
):
    """Split the frames in list order and store them as train and test sets.

    The first ``int(len(data_frames) * train_ratio)`` frames are written to
    ``<path>/train``, the remaining ones to ``<path>/test``.

    Args:
        data_frames: Frames to split and store.
        path: Parent directory of the ``train`` and ``test`` folders.
        train_ratio: Fraction of the frames assigned to the train set.
        label: Prefix of the progress-bar descriptions.
    """
    split_at = int(len(data_frames) * train_ratio)

    partitions = (
        (data_frames[:split_at], "train", f"{label} | Saving train"),
        (data_frames[split_at:], "test", f"{label} | Saving test"),
    )
    for frames, folder, description in partitions:
        save_on_disk(frames, os.path.join(path, folder), label=description)

In [27]:
# Fraction of the generated frames written to the train folder; save_data
# writes the remaining frames to the test folder.
TRAIN_RATIO = 0.9

save_data(
    data_frames_masks_split,
    "../data/interim/masks_split",
    TRAIN_RATIO,
    label="Masks split",
)
save_data(
    data_frames_users_split,
    "../data/interim/users_split",
    TRAIN_RATIO,
    label="Users split",
)

Masks split | Saving train: 100%|██████████| 18/18 [00:11<00:00,  1.62it/s]
Masks split | Saving test: 100%|██████████| 2/2 [00:01<00:00,  1.68it/s]
Users split | Saving train: 100%|██████████| 848/848 [00:15<00:00, 53.73it/s]
Users split | Saving test: 100%|██████████| 95/95 [00:01<00:00, 64.66it/s]


## Read data

In [28]:
# Sanity check: read the saved CSV files back and concatenate them.
# NOTE(review): the path points at the masks_split *test* folder, yet the
# result is named train_df - confirm which subset is intended.
path = os.path.join(".", "../data/interim/masks_split/test/")
# NOTE(review): os.listdir returns every directory entry in arbitrary order,
# so this assumes the folder holds only the generated CSV files.
loaded_dfs = [
    pd.read_csv(os.path.join(path, file_name)) for file_name in os.listdir(path)
]
train_df = pd.concat(loaded_dfs)
train_df.head()

Unnamed: 0,user_id,age,gender,occupation,input,output,genres
0,1.0,0.24,1.0,19.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[5.0, 3.0, 4.0, 3.0, 3.0, 5.0, 4.0, 1.0, 5.0, ...","[0.0, 0.15636363636363637, 0.10181818181818182..."
1,2.0,0.53,0.0,13.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[4.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.13846153846153847, 0.07692307692307693..."
2,3.0,0.23,1.0,20.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.12727272727272726,..."
3,4.0,0.24,1.0,19.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.2, 0.0, 0.0, 0.0, 0.2, 0.6, 0.0, 0.4, ..."
4,5.0,0.33,0.0,13.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[4.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.2571428571428571, 0.13142857142857142,..."
