# Data exploration

This notebook loads the [MovieLens 100K dataset](https://grouplens.org/datasets/movielens/100k/) and encodes every user as a numeric feature vector.


In [2]:
import os

import pandas as pd

In [54]:
# NOTE(review): leftover scratch cell. `genres` is only assigned later, in the
# "Data loading" cell, so on a fresh kernel this line raises the NameError
# recorded in the output below.
genres

NameError: name 'genres' is not defined

## Data loading


In [56]:
def load_genres(path: str) -> list[str]:
    """Read the genre names stored in ``u.genre``.

    Args:
        path: Directory that contains the ``u.genre`` file.

    Returns:
        The values of the first (name) column, in file order.
    """
    genre_file = os.path.join(path, "u.genre")
    # The file has no header row; each line is a pipe-separated pair that is
    # labelled here as name / genre_idx.
    genre_table = pd.read_csv(
        genre_file,
        sep="|",
        header=None,
        names=["name", "genre_idx"],
        encoding="ISO-8859-1",
    )
    return genre_table["name"].tolist()


def load_items(path: str, genres: list[str]) -> pd.DataFrame:
    """Read the movie table stored in ``u.item``.

    Args:
        path: Directory that contains the ``u.item`` file.
        genres: Genre names used as labels for the columns that follow the
            five metadata columns.

    Returns:
        A frame whose columns are the five metadata fields followed by one
        column per entry of ``genres``.
    """
    metadata_columns = [
        "movie_id",
        "movie_title",
        "release_date",
        "video_release_date",
        "IMDb_URL",
    ]
    item_file = os.path.join(path, "u.item")
    # No header row in the file: column labels are supplied explicitly.
    return pd.read_csv(
        item_file,
        sep="|",
        header=None,
        names=metadata_columns + list(genres),
        encoding="ISO-8859-1",
    )


def load_users(path: str) -> pd.DataFrame:
    """Read the user table stored in ``u.user``.

    Args:
        path: Directory that contains the ``u.user`` file.

    Returns:
        A frame with the columns ``user_id``, ``age``, ``gender``,
        ``occupation`` and ``zip_code``.
    """
    user_columns = ["user_id", "age", "gender", "occupation", "zip_code"]
    user_file = os.path.join(path, "u.user")
    # Pipe-separated file without a header row.
    return pd.read_csv(
        user_file,
        sep="|",
        header=None,
        names=user_columns,
        encoding="ISO-8859-1",
    )


def load_connections(path: str) -> pd.DataFrame:
    """Read the user-item rating records stored in ``u.data``.

    Args:
        path: Directory that contains the ``u.data`` file.

    Returns:
        A frame with the columns ``user_id``, ``item_id``, ``rating`` and
        ``timestamp``, one row per line of the file.
    """
    rating_columns = ["user_id", "item_id", "rating", "timestamp"]
    data_file = os.path.join(path, "u.data")
    # Tab-separated file; the column labels are supplied because the file
    # carries no header row.
    return pd.read_csv(
        data_file,
        sep="\t",
        names=rating_columns,
        engine="python",
    )


# Directory holding the extracted MovieLens 100K files. Defined once so that
# every loader in this cell is guaranteed to read from the same place.
ML_100K_DIR = "../data/interim/ml-100k/"

# `genres` is loaded first because `load_items` needs it to label its columns.
genres = load_genres(ML_100K_DIR)
raw_connections_df = load_connections(ML_100K_DIR)
raw_items_df = load_items(ML_100K_DIR, genres)
raw_users_df = load_users(ML_100K_DIR)

# Table sizes, printed as a quick sanity check of the load.
num_users = len(raw_users_df)
num_items = len(raw_items_df)
print(f"{num_users=}\n{num_items=}")
raw_connections_df.head()

num_users=943
num_items=1682


Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


## Data preprocessing

In [13]:
# Working copy of the raw user table, so changes to `users_df` do not touch
# `raw_users_df`.
# NOTE(review): `users_df` is reassigned from `encode_users(...)` further down,
# so this copy looks unused. Confirm before removing.
users_df = raw_users_df.copy()

In [47]:
def read_occupations(path: str) -> dict[str, int]:
    """Map every occupation listed in ``u.occupation`` to its position.

    Args:
        path: Directory that contains the ``u.occupation`` file.

    Returns:
        A dictionary from occupation name (surrounding whitespace stripped)
        to a zero-based index that follows the order of the non-blank lines.
    """
    occupation_file = os.path.join(path, "u.occupation")
    # The encoding is given explicitly (the same one the pandas-based loaders
    # in this notebook use) so the result does not depend on the platform's
    # default text encoding.
    with open(occupation_file, "r", encoding="ISO-8859-1") as f:
        stripped_lines = [line.strip() for line in f]

    # Blank lines (e.g. a trailing newline at the end of the file) are not
    # occupations; keeping them would add an "" key and inflate the dictionary
    # length that is later used as a normalisation constant.
    names = [name for name in stripped_lines if name]
    return {name: idx for idx, name in enumerate(names)}


# Occupation name -> index lookup. `basic_users_encode` reads this as a
# module-level global, so this cell must run before any encoding cell.
occupations_dict = read_occupations("../data/interim/ml-100k/")

In [156]:
def custom_hash(x: str) -> int:
    """Fold a string into one integer using powers of 36 as positional weights.

    The character at position ``i`` contributes ``ord(char) * 36**i``, so the
    first character carries the smallest weight. An empty string gives ``0``.

    Args:
        x: The string to hash.

    Returns:
        The weighted sum of the characters' code points.
    """
    total = 0
    weight = 1  # 36**position, updated incrementally
    for char in x:
        total += ord(char) * weight
        weight *= 36
    return total


def basic_users_encode(df: pd.DataFrame) -> pd.DataFrame:
    """Scale the demographic columns of a user table to numeric features.

    Every feature is derived from ``df`` itself. (The previous version read
    the notebook-level ``raw_users_df`` instead, so the argument was ignored.)

    Transformations applied to a copy of ``df``:
        * ``gender``: 1 where the value equals ``"M"``, otherwise 0.
        * ``age``: divided by 100.
        * ``zip_code``: hashed with ``custom_hash`` and divided by the largest
          hash in the frame.
        * ``occupation``: looked up in the module-level ``occupations_dict``
          and divided by the number of entries in that dictionary.

    Args:
        df: User table with ``gender``, ``age``, ``zip_code`` and
            ``occupation`` columns.

    Returns:
        A new frame; ``df`` is left unmodified.
    """
    users_df = df.copy()

    users_df["gender"] = (df["gender"] == "M").astype(int)
    users_df["age"] = df["age"] / 100

    # Normalise by the largest hash so the biggest value maps to exactly 1.
    zip_hashes = df["zip_code"].apply(custom_hash)
    users_df["zip_code"] = zip_hashes / zip_hashes.max()

    users_df["occupation"] = df["occupation"].apply(occupations_dict.get) / len(
        occupations_dict
    )

    return users_df


def calculate_genre_ratio(
    user_id: int, connections_df: pd.DataFrame, items_df: pd.DataFrame
) -> list[float]:
    """Compute how a user's rated movies are distributed over the genres.

    The rows of ``connections_df`` that belong to ``user_id`` select a set of
    movies; the genre flags of those movies are summed per genre and divided
    by the grand total, so the result sums to 1 whenever the total is non-zero.

    Args:
        user_id: User whose ratings are inspected.
        connections_df: Rating records with ``user_id`` and ``item_id``
            columns.
        items_df: Movie table with a ``movie_id`` column; every column from
            position 5 onwards is treated as a genre flag.

    Returns:
        A 1-D NumPy float array with one entry per genre column. (The
        ``list[float]`` annotation is kept for interface stability.) If the
        user has no rated movie with a genre flag, the result is all zeros
        rather than NaN from a division by zero.
    """
    # Both the mask and the frame it selects from must be `connections_df`;
    # mixing in a notebook-level frame would ignore the caller's data.
    user_mask = connections_df["user_id"] == user_id
    movies_set = set(connections_df.loc[user_mask, "item_id"].tolist())

    genres_sum = (
        items_df[items_df["movie_id"].isin(movies_set)].iloc[:, 5:].sum(axis=0).to_numpy()
    )

    total = genres_sum.sum()
    if total == 0:
        # Nothing to normalise: return zeros with the same float dtype as the
        # normal path.
        return genres_sum.astype(float)
    return genres_sum / total


def encode_genres(
    df: pd.DataFrame,
    connections_df: pd.DataFrame,
    items_df: pd.DataFrame,
    genres: list[str],
) -> pd.DataFrame:
    """Append one ``genre_<name>`` ratio column per genre to a user table.

    Args:
        df: User table with a ``user_id`` column.
        connections_df: Rating records passed to ``calculate_genre_ratio``.
        items_df: Movie table passed to ``calculate_genre_ratio``.
        genres: Genre names; they become the ``genre_<name>`` column labels,
            in the given order.

    Returns:
        ``df`` with the genre-ratio columns concatenated on the right.
    """
    ratio_rows = [
        calculate_genre_ratio(user_id, connections_df, items_df)
        for user_id in df["user_id"]
    ]
    # Reuse df's index: `pd.concat(axis=1)` aligns on index labels, so a fresh
    # 0..n-1 index would misalign (and NaN-pad) rows whenever `df` carries a
    # non-default index, e.g. after filtering.
    genres_df = pd.DataFrame(
        ratio_rows,
        columns=[f"genre_{genre}" for genre in genres],
        index=df.index,
    )
    return pd.concat([df, genres_df], axis=1)


def calculate_ratings(
    user_id: int, items_length: int, connections_df: pd.DataFrame
) -> list[float]:
    """Build a dense vector of one user's ratings scaled by 1/5.

    Position ``item_id - 1`` holds ``rating / 5`` for every record of the
    user; all other positions stay ``0.0``. If the user has several records
    for the same item, the last one in ``connections_df`` wins.

    Args:
        user_id: User whose records are collected.
        items_length: Length of the returned vector.
        connections_df: Rating records with ``user_id``, ``item_id`` and
            ``rating`` columns.

    Returns:
        A list of ``items_length`` floats.

    Raises:
        IndexError: If one of the user's ``item_id`` values lies outside
            ``1..items_length``. Previously an id of 0 or a negative id would
            silently overwrite an entry at the end of the list.
    """
    ratings = [0.0] * items_length
    user_rows = connections_df.loc[
        connections_df["user_id"] == user_id, ["item_id", "rating"]
    ]
    # Iterate the two columns directly instead of `iterrows()`: it is faster
    # and keeps each column's own dtype, whereas `iterrows()` upcasts a
    # mixed-dtype row to float, which cannot be used as a list index.
    for item_id, rating in zip(user_rows["item_id"], user_rows["rating"]):
        if not 1 <= item_id <= items_length:
            raise IndexError(
                f"item_id {item_id} is outside the valid range 1..{items_length}"
            )
        ratings[item_id - 1] = rating / 5
    return ratings


def encode_ratings(
    df: pd.DataFrame, connections_df: pd.DataFrame, items_df: pd.DataFrame
) -> pd.DataFrame:
    """Append one ``rating_<movie_id>`` column per movie to a user table.

    Args:
        df: User table with a ``user_id`` column.
        connections_df: Rating records passed to ``calculate_ratings``.
        items_df: Movie table; its ``movie_id`` column provides the column
            labels and the vector length.

    Returns:
        ``df`` with the rating columns concatenated on the right.
    """
    item_ids = items_df["movie_id"].to_list()
    items_length = len(item_ids)

    # NOTE(review): `calculate_ratings` stores a rating at position
    # `item_id - 1`, while the labels below follow the order of
    # `items_df["movie_id"]`. The two agree only if the movie ids are
    # 1..N in order. Confirm this holds for the data being encoded.
    rating_rows = [
        calculate_ratings(user_id, items_length, connections_df)
        for user_id in df["user_id"]
    ]
    # Reuse df's index: `pd.concat(axis=1)` aligns on index labels, so a fresh
    # 0..n-1 index would misalign (and NaN-pad) rows whenever `df` carries a
    # non-default index.
    ratings_df = pd.DataFrame(
        rating_rows,
        columns=[f"rating_{item_id}" for item_id in item_ids],
        index=df.index,
    )
    return pd.concat([df, ratings_df], axis=1)


def encode_users(
    df: pd.DataFrame,
    connections_df: pd.DataFrame,
    items_df: pd.DataFrame,
    genres: list[str],
) -> pd.DataFrame:
    """Run the full user-encoding pipeline.

    Steps, in order:
        1. ``basic_users_encode``: scale the demographic columns.
        2. Add ``watched``, the number of rating records per user.
        3. ``encode_genres``: append the per-genre ratio columns.
        4. ``encode_ratings``: append the per-movie rating columns.

    Args:
        df: Raw user table with a ``user_id`` column.
        connections_df: Rating records with a ``user_id`` column.
        items_df: Movie table.
        genres: Genre names used to label the genre-ratio columns.

    Returns:
        The encoded user table.
    """
    users_df = basic_users_encode(df)

    # Count every user's records in a single pass instead of filtering the
    # whole ratings table once per user. Users with no records get 0.
    ratings_per_user = connections_df["user_id"].value_counts()
    users_df["watched"] = (
        users_df["user_id"].map(ratings_per_user).fillna(0).astype("int64")
    )  # for future masking

    users_df = encode_genres(users_df, connections_df, items_df, genres)

    return encode_ratings(users_df, connections_df, items_df)

In [157]:
# Encode all users from the raw tables, then preview the first rows of the result.
users_df = encode_users(raw_users_df, raw_connections_df, raw_items_df, genres)
users_df.head()

Unnamed: 0,user_id,age,gender,occupation,zip_code,watched,genre_unknown,genre_Action,genre_Adventure,genre_Animation,...,rating_1673,rating_1674,rating_1675,rating_1676,rating_1677,rating_1678,rating_1679,rating_1680,rating_1681,rating_1682
0,1,0.24,1,0.904762,0.569647,272,0.001704,0.127768,0.07155,0.020443,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0.53,0,0.619048,0.59313,62,0.0,0.081301,0.02439,0.00813,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0.23,1,0.952381,0.638964,54,0.0,0.114754,0.032787,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,0.24,1,0.904762,0.638066,24,0.0,0.142857,0.071429,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,0.33,0,0.619048,0.592206,175,0.002604,0.145833,0.085938,0.036458,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Save data

In [159]:
# Persist the encoded user table without the index column.
# NOTE(review): the file is tab-separated despite its .csv extension, so
# readers must pass sep="\t". It is also written under data/raw even though it
# is derived data. Confirm that location is intended.
users_df.to_csv("../data/raw/encoded_users.csv", sep="\t", index=False)