# Data exploration 2.0

[MovieLens 100K dataset](https://grouplens.org/datasets/movielens/100k/)


In [1]:
import os

import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
MANUAL_SEED: int = 42

## Data loading


In [14]:
def load_occupations(path: str) -> dict[str, int]:
    """Map each occupation name in ``u.occupation`` to its line index.

    Args:
        path: Directory that contains the ``u.occupation`` file.

    Returns:
        A dict whose keys are the whitespace-stripped lines of the file and
        whose values are the zero-based positions of those lines.
    """
    # Read with the same explicit encoding as the other loaders in this file
    # instead of relying on the platform default.
    with open(os.path.join(path, "u.occupation"), encoding="ISO-8859-1") as f:
        return {line.strip(): idx for idx, line in enumerate(f)}


def load_genres(path: str) -> list[str]:
    """Return the genre names listed in ``u.genre``, in file order.

    Args:
        path: Directory that contains the pipe-separated ``u.genre`` file.

    Returns:
        The values of the first column (the genre names) as a list.
    """
    genre_file = os.path.join(path, "u.genre")
    genre_table = pd.read_csv(
        genre_file,
        sep="|",
        header=None,
        names=["name", "genre_idx"],
        encoding="ISO-8859-1",
    )
    return genre_table["name"].to_list()


def load_items(path: str, genres: list[str]) -> pd.DataFrame:
    """Read the pipe-separated ``u.item`` file into a DataFrame.

    Args:
        path: Directory that contains the ``u.item`` file.
        genres: Names used for the columns that follow the five descriptive
            columns (one column per genre, in the given order).

    Returns:
        A DataFrame with the descriptive columns followed by the genre columns.
    """
    descriptive_columns = [
        "movie_id",
        "movie_title",
        "release_date",
        "video_release_date",
        "IMDb_URL",
    ]
    item_file = os.path.join(path, "u.item")
    return pd.read_csv(
        item_file,
        sep="|",
        header=None,
        names=[*descriptive_columns, *genres],
        encoding="ISO-8859-1",
    )


def load_users(path: str) -> pd.DataFrame:
    """Read the pipe-separated ``u.user`` file into a DataFrame.

    Args:
        path: Directory that contains the ``u.user`` file.

    Returns:
        A DataFrame with the columns ``user_id``, ``age``, ``gender``,
        ``occupation`` and ``zip_code``.
    """
    user_file = os.path.join(path, "u.user")
    column_names = ["user_id", "age", "gender", "occupation", "zip_code"]
    return pd.read_csv(
        user_file,
        sep="|",
        header=None,
        names=column_names,
        encoding="ISO-8859-1",
    )


def load_connections(path: str) -> pd.DataFrame:
    """Read the tab-separated ``u.data`` file into a DataFrame.

    Args:
        path: Directory that contains the ``u.data`` file.

    Returns:
        A DataFrame with the columns ``user_id``, ``item_id``, ``rating`` and
        ``timestamp``.
    """
    ratings_file = os.path.join(path, "u.data")
    column_names = ["user_id", "item_id", "rating", "timestamp"]
    return pd.read_csv(
        ratings_file,
        sep="\t",
        names=column_names,
        engine="python",
    )


# Load every table from the interim MovieLens directory.
# Genres come first because load_items uses them to name its genre columns.
genres = load_genres("../data/interim/ml-100k/")
raw_items_df = load_items("../data/interim/ml-100k/", genres)
raw_users_df = load_users("../data/interim/ml-100k/")
raw_connections_df = load_connections("../data/interim/ml-100k/")
occupations_dict = load_occupations("../data/interim/ml-100k/")

# Report the table sizes and preview the ratings table.
num_users = raw_users_df.shape[0]
num_items = raw_items_df.shape[0]
print(f"{num_users=}\n{num_items=}")
raw_connections_df.head()

num_users=943
num_items=1682


Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


## Data preprocessing

In [15]:
# Work on an independent copy so the raw users table stays untouched.
users_df = raw_users_df.copy(deep=True)

In [21]:
def basic_users_encode(
    df: pd.DataFrame, occupations: "dict[str, int] | None" = None
) -> pd.DataFrame:
    """Return a numerically encoded copy of a users table.

    The input frame is not modified. In the returned copy:

    * ``gender`` is 1 where the original value equals ``"M"`` and 0 otherwise;
    * ``age`` is divided by 100;
    * ``occupation`` is replaced by its value in the occupation mapping
      (names missing from the mapping yield a missing value);
    * ``zip_code`` is dropped.

    Args:
        df: Users table with ``age``, ``gender``, ``occupation`` and
            ``zip_code`` columns.
        occupations: Mapping from occupation name to integer code. When
            omitted, the notebook-level ``occupations_dict`` is used.

    Returns:
        The encoded copy of ``df``.
    """
    if occupations is None:
        # Fall back to the mapping loaded earlier in the notebook.
        occupations = occupations_dict

    users_df = df.copy()

    # Encode from the frame that was passed in, not from a notebook global,
    # so the function works for any users table.
    users_df["gender"] = (df["gender"] == "M").astype(int)
    users_df["age"] = df["age"] / 100

    users_df.drop(columns=["zip_code"], inplace=True)

    users_df["occupation"] = df["occupation"].apply(occupations.get)

    return users_df

In [20]:
# Encode the raw users table and preview the result.
users_df = basic_users_encode(df=raw_users_df)
users_df.head()

Unnamed: 0,user_id,age,gender,occupation
0,1,0.24,1,19
1,2,0.53,0,13
2,3,0.23,1,20
3,4,0.24,1,19
4,5,0.33,0,13


In [33]:
def calculate_ratings(
    user_id: int, items_length: int, connections_df: pd.DataFrame
) -> list[float]:
    """Build the dense rating vector of a single user.

    Args:
        user_id: Value to match against the ``user_id`` column.
        items_length: Length of the returned vector.
        connections_df: Ratings table with ``user_id``, ``item_id`` and
            ``rating`` columns. Item ids are treated as 1-based positions.

    Returns:
        A list of ``items_length`` floats in which position ``item_id - 1``
        holds the user's rating and every other position is ``0.0``. When an
        item appears more than once, the last row wins.

    Raises:
        IndexError: If an ``item_id`` exceeds ``items_length``.
    """
    user_rows = connections_df[connections_df["user_id"] == user_id]

    ratings = [0.0] * items_length
    # Read the two needed columns directly instead of using iterrows(): this
    # keeps item ids integral regardless of the other columns' dtypes and
    # stores plain floats, matching the declared return type.
    for item_id, rating in zip(
        user_rows["item_id"].tolist(), user_rows["rating"].tolist()
    ):
        ratings[item_id - 1] = float(rating)
    return ratings


def get_ratings_dict(
    df: pd.DataFrame, connections_df: pd.DataFrame, items_df: pd.DataFrame
) -> dict[int, list[float]]:
    """Compute the dense rating vector of every user in ``df``.

    Args:
        df: Users table; its ``user_id`` column supplies the dict keys.
        connections_df: Ratings table passed on to ``calculate_ratings``.
        items_df: Items table; the length of its ``movie_id`` column sets the
            length of every rating vector.

    Returns:
        A dict mapping each user id to the list returned by
        ``calculate_ratings`` for that user.
    """
    vector_length = len(items_df["movie_id"])
    return {
        uid: calculate_ratings(uid, vector_length, connections_df)
        for uid in df["user_id"].tolist()
    }

In [34]:
# Dense rating vector per user, keyed by user id.
ratings_dict = get_ratings_dict(
    df=users_df,
    connections_df=raw_connections_df,
    items_df=raw_items_df,
)

In [65]:
# Scratch cell: keep the first row of users_df in `r` for the inspection below.
for _, row in tqdm(users_df.iterrows()):
    r = row
    break  # only the first row is needed

0it [00:00, ?it/s]


In [70]:
# Scratch cell: check how a row's dict form merges into an existing dict.
a = {"dsfds": "1were", **r.to_dict()}
a

{'dsfds': '1were',
 'user_id': 1.0,
 'age': 0.24,
 'gender': 1.0,
 'occupation': 19.0}

In [85]:
def calculate_genre_ratios(
    movie_indices: np.ndarray, items_df: pd.DataFrame
) -> np.ndarray:
    """Return the normalised genre distribution of the selected movies.

    Args:
        movie_indices: Zero-based movie positions; each is shifted by one to
            match the 1-based ``movie_id`` column.
        items_df: Items table whose columns from position 5 onwards are
            per-genre indicator columns (the layout built by ``load_items``).

    Returns:
        A 1-D array with one entry per genre column holding that genre's share
        of all genre flags among the selected movies. When the selected movies
        carry no genre flags at all (for example, an empty selection), an
        all-zero float array is returned instead of dividing by zero.
    """
    movie_ids = np.asarray(movie_indices) + 1
    selected = items_df[items_df["movie_id"].isin(movie_ids)]

    genres_sum = selected.iloc[:, 5:].sum(axis=0).to_numpy()
    total = genres_sum.sum()
    if total == 0:
        # 0 / 0 would produce NaNs and a RuntimeWarning.
        return np.zeros(genres_sum.shape, dtype=float)
    return genres_sum / total


def generate_masks_frames(
    users_df: pd.DataFrame,
    items_df: pd.DataFrame,
    ratings_dict: dict[int, list[float]],
    mask_percents: list[float] = [i / 10 for i in range(1, 10)],
    num_masks: int = 3,
    seed: int = MANUAL_SEED,
) -> list[pd.DataFrame]:
    """Build masked-rating training frames for every user.

    For each of ``num_masks`` repetitions and each fraction in
    ``mask_percents``, one DataFrame is produced with one row per user. A
    random subset of the user's rated items (the given fraction, rounded
    down) is zeroed out to form the ``input`` vector; ``output`` keeps the
    full rating vector and ``genres`` holds the genre ratios of the items
    that stayed visible.

    Args:
        users_df: Users table; every column is copied into the result rows via
            ``row.to_dict()``.
        items_df: Items table passed to ``calculate_genre_ratios``.
        ratings_dict: Dense rating vector per ``user_id``.
        mask_percents: Fractions of rated items to hide. The default list is
            created once at definition time; it is only read here.
        num_masks: Number of independent repetitions per fraction.
        seed: Seed given to NumPy's global random generator.

    Returns:
        ``num_masks * len(mask_percents)`` DataFrames, ordered repetition by
        repetition and, within a repetition, in ``mask_percents`` order.
    """
    # NOTE: this reseeds NumPy's global RNG, which also affects later callers.
    np.random.seed(seed)

    frames = []

    for _ in range(num_masks):
        for fraction in mask_percents:
            columns = {}
            for _, user_row in users_df.iterrows():
                full_ratings = np.array(ratings_dict[user_row["user_id"]])

                # Shuffle the rated positions, then split them into a hidden
                # prefix and a still-visible remainder.
                rated = full_ratings.nonzero()[0]
                np.random.shuffle(rated)
                cut = int(fraction * len(rated))
                hidden, visible = rated[:cut], rated[cut:]

                masked_ratings = full_ratings.copy()
                masked_ratings[hidden] = 0.0

                record = {
                    **user_row.to_dict(),
                    "input": masked_ratings.tolist(),
                    "output": full_ratings.tolist(),
                    "genres": calculate_genre_ratios(visible, items_df).tolist(),
                }
                # Accumulate column-wise so the frame is built in one go.
                for key, value in record.items():
                    columns.setdefault(key, []).append(value)

            frames.append(pd.DataFrame.from_dict(columns))

    return frames

In [86]:
# One frame per (repetition, mask fraction) pair, using the default settings.
data_frames = generate_masks_frames(
    users_df=users_df,
    items_df=raw_items_df,
    ratings_dict=ratings_dict,
)

In [93]:
# Write each generated frame to its own CSV file, named by its position.
# Create the target directory first: to_csv fails if it does not exist.
os.makedirs("../data/raw/user_masks/", exist_ok=True)
for idx, data_frame in tqdm(enumerate(data_frames), total=len(data_frames)):
    data_frame.to_csv(f"../data/raw/user_masks/{idx}.csv", index=False)

100%|██████████| 27/27 [00:21<00:00,  1.25it/s]


## Read data

In [94]:
# Read every saved mask frame back and stack them into one table.
path = os.path.join(".", "../data/raw/user_masks/")
# os.listdir returns entries in arbitrary order and may include entries that
# are not CSV files, so filter and sort (lexicographically) to keep the
# concatenated row order reproducible.
loaded_dfs = [
    pd.read_csv(os.path.join(path, file_name))
    for file_name in sorted(os.listdir(path))
    if file_name.endswith(".csv")
]
full_df = pd.concat(loaded_dfs)
full_df.head()

Unnamed: 0,user_id,age,gender,occupation,input,output,genres
0,1.0,0.24,1.0,19.0,"[5.0, 3.0, 4.0, 3.0, 3.0, 5.0, 4.0, 1.0, 5.0, ...","[5.0, 3.0, 4.0, 3.0, 3.0, 5.0, 4.0, 1.0, 5.0, ...","[0.0019047619047619048, 0.1219047619047619, 0...."
1,2.0,0.53,0.0,13.0,"[4.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[4.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.08333333333333333, 0.02777777777777777..."
2,3.0,0.23,1.0,20.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.10810810810810811, 0.03603603603603603..."
3,4.0,0.24,1.0,19.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.14814814814814814, 0.07407407407407407..."
4,5.0,0.33,0.0,13.0,"[4.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[4.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0028735632183908046, 0.14942528735632185, 0..."
