In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime

In [51]:
# Reading File
rel_data = pd.read_csv("data/UserAnimeList.csv", usecols=["username", "anime_id", "my_score"])

rel_data = rel_data[rel_data["my_score"] != 0.0]

anime_by_watched = rel_data["anime_id"].value_counts()
pop_anime = anime_by_watched[anime_by_watched >= 2000] # only including anime with at least 2000 rated users
rel_data = rel_data[rel_data["anime_id"].isin(pop_anime.index)]
rel_data = rel_data.reset_index(drop=True)

print("Read", len(rel_data.index), "rows")


Read 42830024 rows


In [52]:
unique_users = rel_data["username"].unique()
unique_anime = rel_data["anime_id"].unique()

print("Unique Users:",len(unique_users))
print("Unique Anime:",len(unique_anime))

Unique Users: 271259
Unique Anime: 3418


In [53]:
# Creating map from their id to our id
anime_id_map = {}
for idx in range(len(unique_anime)):
    anime_id_map[unique_anime[idx]] = idx

In [54]:
# Making user vectors
count = 0
all_user_data = []
for user in unique_users:
    start = count
    while (count < len(rel_data.index) and rel_data["username"].iloc[count] == user):
        count += 1
    if (count % 1000000 < 200):
        print("Read", count, "rows", flush=True)
    user_anime = rel_data[start:count][["anime_id", "my_score"]]
    user_data = pd.Series(np.zeros(len(unique_anime)))
    user_data[user_anime["anime_id"].apply(lambda x: anime_id_map[x])] = user_anime["my_score"]
    all_user_data.append(user_data)

Read 53 rows
Read 2000134 rows
Read 3000137 rows
Read 3000152 rows
Read 4000110 rows
Read 6000052 rows
Read 6000125 rows
Read 8000079 rows
Read 9000152 rows
Read 10000147 rows
Read 12000136 rows
Read 12000191 rows
Read 14000010 rows
Read 16000185 rows
Read 17000085 rows
Read 18000155 rows


In [57]:
step = 20000
pd.DataFrame(all_user_data[:step]).to_csv("clean_data/users1.csv", index=False)


In [58]:
pd.DataFrame(all_user_data[step:2 * step]).to_csv("clean_data/users2.csv", index=False)


In [59]:
pd.DataFrame(all_user_data[2 * step:3 * step]).to_csv("clean_data/users3.csv", index=False)


In [60]:
pd.DataFrame(all_user_data[3 * step:4 * step]).to_csv("clean_data/users4.csv", index=False)


In [61]:
np.savetxt("clean_data/anime_id_map_reverse.csv", unique_anime.astype(np.int), delimiter=",")
