In [14]:
import json
import pandas as pd

def filter_by_freq(df: pd.DataFrame, column: str, min_freq: int) -> pd.DataFrame:
    """Keep only the rows whose value in ``column`` is frequent enough.

    Args:
        df: Frame to filter; it is not modified.
        column: Name of the column whose values are counted.
        min_freq: Threshold on the occurrence count. The comparison is
            strict: a value must occur *more than* ``min_freq`` times.

    Returns:
        The subset of ``df`` (original index and row order preserved) whose
        ``column`` value occurs more than ``min_freq`` times in ``df``.
    """
    occurrences = df[column].value_counts()
    kept_values = occurrences.loc[occurrences > min_freq].index
    row_mask = df[column].isin(kept_values)
    return df[row_mask]
def create_id_map(sampled_ids):
    """Assign consecutive integer indices to ids, in sorted order.

    Args:
        sampled_ids: Iterable of mutually comparable, hashable ids.

    Returns:
        A dict mapping each id to its position (starting at 0) in the
        sorted sequence of ``sampled_ids``.
    """
    ordered_ids = sorted(sampled_ids)
    return dict(zip(ordered_ids, range(len(ordered_ids))))
def print_sample_summary(df, user_col_name='userId', item_col_name='movieId'):
    """Print the size of an interaction table and how full its matrix is.

    Two lines are printed: the number of rows ("ratings") together with the
    number of distinct users and items, and then a "Sparsity" figure computed
    as ``rows / (distinct users * distinct items)``, i.e. the fraction of
    user-item cells that are filled.

    Args:
        df: Frame with one row per rating.
        user_col_name: Column holding the user identifier.
        item_col_name: Column holding the item identifier.
    """
    interaction_count = len(df)
    distinct_users, distinct_items = (
        len(set(df[col])) for col in (user_col_name, item_col_name)
    )
    matrix_cells = distinct_users * distinct_items
    print(interaction_count, " ratings, ", distinct_users, " users, ", distinct_items, " items")
    print("Sparsity: ", interaction_count / matrix_cells)

# Parse the raw review dump: one record per line of the file.
# NOTE(review): each line is evaluated as a Python expression. eval() executes
# arbitrary code, so this is only acceptable for a trusted local file. If every
# record is a plain literal, ast.literal_eval would be a safer replacement --
# confirm against the data before switching.
lines = []
# The context manager closes the handle when the loop ends (or fails), and
# iterating the file object streams it line by line instead of first building
# a second full copy of the file with readlines().
with open('../datasets/reviews.clean.json') as reviews_file:
    for line in reviews_file:
        lines.append(eval(line))

In [8]:
# One row per parsed review record.
df = pd.DataFrame(lines)
# Drop the reviewer-name, free-text, category and formatted-date columns; the
# visible cells of this notebook only use the user and place id columns.
df = df.drop(columns=['reviewerName', 'reviewText', 'categories', 'reviewTime'])
# Cache the slimmed-down table. The path must be under '../datasets/' (plural):
# that is the directory the raw file is read from and the location the next
# cell reloads this CSV from; the previous '../dataset/' spelling wrote the
# cache somewhere the reload never looks.
df.to_csv('../datasets/gl_df.csv')

In [10]:
# Reload the slimmed-down review table from its CSV cache.
# NOTE(review): the cache is written by to_csv() with the default index column,
# so this frame presumably gains an extra 'Unnamed: 0' column -- confirm before
# relying on column positions downstream.
df = pd.read_csv('../datasets/gl_df.csv')

In [11]:
# Size and fill ratio of the full review table (users x places).
print_sample_summary(df, user_col_name='gPlusUserId', item_col_name='gPlusPlaceId')

11453845  ratings,  5054567  users,  3116785  items
Sparsity:  7.270436722548411e-07


In [12]:
# Keep only the reviews written by users with more than 150 reviews, then
# report how large the remaining table is.
filtered_df = filter_by_freq(df, column='gPlusUserId', min_freq=150)
print_sample_summary(filtered_df, user_col_name='gPlusUserId', item_col_name='gPlusPlaceId')

438351  ratings,  1571  users,  275402  items
Sparsity:  0.0010131616130746037


In [13]:
import numpy as np

# Fixed seed so the 1000-user sample, and every file derived from it, can be
# regenerated. A private RandomState avoids touching numpy's global RNG state.
sample_rng = np.random.RandomState(0)

# .unique() returns ids in first-appearance order. list(set(...)) does not have
# a stable order for string ids across interpreter runs, which would make the
# draw irreproducible even with a seed.
candidate_uids = filtered_df['gPlusUserId'].unique()
sampled_uids = sample_rng.choice(candidate_uids, 1000, replace=False)

# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the previous frame (SettingWithCopyWarning).
filtered_df = filtered_df[filtered_df['gPlusUserId'].isin(sampled_uids)].copy()

# Item ids must be collected from the *place* column. Building this list from
# 'gPlusUserId' left mid_map without any place id, so the 'mid' column below
# came out entirely NaN.
sampled_mids = filtered_df['gPlusPlaceId'].unique()

# Dense 0-based integer ids for users and places.
uid_map = create_id_map(sampled_uids)
mid_map = create_id_map(sampled_mids)
filtered_df['uid'] = filtered_df['gPlusUserId'].map(uid_map)
filtered_df['mid'] = filtered_df['gPlusPlaceId'].map(mid_map)
print_sample_summary(filtered_df, 'gPlusUserId', 'gPlusPlaceId')

275804  ratings,  1000  users,  189569  items
Sparsity:  0.0014549003265301817


In [None]:
from sklearn.model_selection import train_test_split

# Full sampled dataset, before any train/test partitioning.
filtered_df.to_csv('../datasets/gl/u.data')


n_runs = 5
for i in range(n_runs):
    # train_test_split shuffles by default and returns new frames, so the
    # per-run copies, the manual sample(frac=1) shuffle and the re-assignment
    # of 'uid' (which the frames never lost) are unnecessary.
    # stratify on 'uid' keeps roughly 80% / 20% of every user's ratings in the
    # base / test file; random_state=i makes each of the n_runs splits distinct
    # yet reproducible.
    X_train, X_test = train_test_split(
        filtered_df,
        stratify=filtered_df['uid'],
        test_size=0.2,
        random_state=i,
    )
    X_train.to_csv('../datasets/gl/u' + str(i) + '.base')
    X_test.to_csv('../datasets/gl/u' + str(i) + '.test')