In [None]:
import pandas as pd
import csv
from pathlib import Path
import matplotlib.pyplot as plt

In [None]:
# Read the movie metadata table and the ratings table, in that order.
movies_df, ratings_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/common/ml-25m/movies.csv',
        '../data/common/ml-25m/ratings.csv',
    )
)

In [None]:
# Quick look at the first three rating rows to check the loaded columns.
ratings_df.head(n=3)

In [None]:
# Per-movie rating statistics, indexed by movieId:
#   counts         -> number of rating rows recorded for the movie
#   average_rating -> mean of those ratings
ratings_by_movie = ratings_df.groupby(by='movieId')
grouped = ratings_by_movie.agg(
    counts=pd.NamedAgg(column='rating', aggfunc='size'),
    average_rating=pd.NamedAgg(column='rating', aggfunc='mean'),
)

In [None]:
# Number of movies drawn for each of the two popularity buckets below.
n_items = 200

# "Frequent" bucket: movies with at least 5000 ratings. Draw a seeded random
# sample, then order it from highest to lowest average rating.
frequent = (
    grouped.loc[grouped['counts'].ge(5000)]
    .sample(n=n_items, random_state=0)
    .sort_values('average_rating', ascending=False)
)
print(len(frequent))

# "Infrequent" bucket: movies with between 50 and 500 ratings (both bounds
# inclusive), sampled and ordered the same way.
infrequent = (
    grouped.loc[grouped['counts'].between(50, 500)]
    .sample(n=n_items, random_state=0)
    .sort_values('average_rating', ascending=False)
)
print(len(infrequent))

In [None]:
# Baseline sample: 300 movies drawn (seeded) from every movie regardless of
# how many ratings it has, ordered from highest to lowest average rating.
all_freq = (
    grouped
    .sample(n=300, random_state=0)
    .sort_values('average_rating', ascending=False)
)

In [None]:
# Attach each movie's title to the three samples with a left join on movieId,
# so every sampled row is kept and stays in its existing (rating-sorted) order.
#
# Each frame is rebound from itself here, so this cell may be executed more
# than once in a live kernel. Dropping any `title` column left over from a
# previous run keeps the cell idempotent; without it, a second run would merge
# a second title column and pandas would rename the pair to title_x / title_y.
# On the first run there is no `title` column yet and the drop is a no-op.
title_lookup = movies_df[['movieId', 'title']]

frequent = pd.merge(
    frequent.drop(columns='title', errors='ignore'),
    title_lookup,
    on='movieId',
    how='left',
)
infrequent = pd.merge(
    infrequent.drop(columns='title', errors='ignore'),
    title_lookup,
    on='movieId',
    how='left',
)
all_freq = pd.merge(
    all_freq.drop(columns='title', errors='ignore'),
    title_lookup,
    on='movieId',
    how='left',
)

In [None]:
# Preview the first five rows of the frequent sample after the title merge.
frequent.head(n=5)

In [None]:
# Draw one small line chart per sample. Each frame was sorted by
# average_rating in descending order when it was built, so the x axis is the
# item's position in that ordering and the y axis is its mean rating.
# A single loop replaces three copy-pasted plotting stanzas, and the explicit
# fig/ax interface avoids relying on pyplot's implicit "current figure".
for sample_df, plot_title in (
    (frequent, 'Average ratings of frequent items'),
    (infrequent, 'Average ratings of infrequent items'),
    (all_freq, 'Average ratings of all items'),
):
    fig, ax = plt.subplots(figsize=(5, 3))
    ax.plot(range(len(sample_df)), sample_df['average_rating'], marker='.')
    ax.set_title(plot_title)
    # Axis labels let each figure stand on its own when the notebook is skimmed.
    ax.set_xlabel('Rank by average rating')
    ax.set_ylabel('Average rating')

In [None]:
# Output directory for the sampled item lists; created (with any missing
# parents) if it does not exist yet.
save_dir = Path(f'../data/t2_bin_preference/')
save_dir.mkdir(parents=True, exist_ok=True)

# Write each sample to its own CSV. The row count is embedded in the file name.
for frame, filename in (
    (frequent, f'frequent-{len(frequent)}.csv'),
    (infrequent, f'infrequent-{len(infrequent)}.csv'),
    (all_freq, f'all-{len(all_freq)}.csv'),
):
    frame.to_csv(save_dir / filename)