In [1]:
import numpy as np
import pandas as pd

from datetime import date

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

## Final Functions
parameters:
- sample size: N
- seed for randomization
- train-test split ratio

In [2]:
## Note: csv_file is not included in the repo

def get_sample(csv_file, N, seed):
    np.random.seed(seed)
    ratings = pd.read_csv(csv_file)
    
    #keep only ratings from the most recent 5 years： 2010-2015
    ratings['year'] = ratings['timestamp'].apply(lambda x: date.fromtimestamp(x).year)
    ratings = ratings[ratings['year']>=date.fromisoformat('2010-01-01').year]
    ratings.drop(['timestamp','year'],axis=1,inplace=True) #'timestamp' and 'year' no longer useful 
    
    #keep only movies with at least 20 ratings
    m_counts = ratings.groupby('movieId').size()
    m_index = m_counts[m_counts>=20].index
    ratings = ratings[ratings['movieId'].isin(m_index)]
    
    #keep only users who have rated at least 20 movies - consistent with original data
    u_counts = ratings.groupby('userId').size()
    u_index = u_counts[u_counts>=20].index
    ratings = ratings[ratings['userId'].isin(u_index)]
    
    #sample out N users
    users = np.unique(ratings['userId'])
    u_index_sample = np.random.choice(users, N)
    sample = ratings[ratings['userId'].isin(u_index_sample)]
    
    #keep a percentage of ratings for each user: 30% -- lower variance
    pct = pd.DataFrame(columns=['userId','movieId','rating'])
    u_N, u_rating_count = np.unique(sample['userId'], return_counts=True)
    
    for i in range(len(u_N)):
        u_id = u_N[i]
        u_sample = sample[sample['userId']==u_id]
        u_sample = u_sample[u_sample['movieId'].isin(np.random.choice(np.unique(u_sample['movieId']),
                                                                      int(u_rating_count[i]*0.3)))]
        pct = pct.append(u_sample)
    
    sample = pct
    
#     #plot data
#     users_s, u_count_s = np.unique(sample['userId'], return_counts=True)
#     plt.plot(np.arange(1, len(users_s)+1), sorted(sample.groupby('userId').size(), reverse=True))
#     plt.xlabel('number of users')
#     plt.ylabel('number of rated movies')
#     plt.show()
    
    print('number of unique users:', len(np.unique(sample['userId'])))
    print('number of unique movies:', len(np.unique(sample['movieId'])))
    
    return sample


In [3]:
sample = get_sample('ratings.csv', 500, 1)

number of unique users: 494
number of unique movies: 4191


In [4]:
sample.head()

Unnamed: 0,userId,movieId,rating
10993,99,1,4.0
10999,99,480,3.0
11011,99,1261,4.0
11015,99,2454,3.0
11020,99,2959,5.0


In [5]:
"""Save Sample Data File for Reference When Needed"""
sample.to_csv('sample_data.csv')

In [6]:
def get_train_test(sample_data, split, seed):
    np.random.seed(seed)
    sample = sample_data
    
    #if starting with csv file
#     sample = pd.read_csv(sample_data)
#     sample.drop(['Unnamed: 0'],axis=1,inplace=True)
    
    train, test = train_test_split(sample, test_size=split)
    
    #check
    print('number of users in train:',len(np.unique(train['userId'])))
    print('number of movies in train:',len(np.unique(train['movieId'])))
    print('number of users in test:', len(np.unique(test['userId'])))
    print('number of movies in test:',len(np.unique(test['movieId'])))
    
    return train, test
    

In [7]:
train, test = get_train_test(sample, 0.3, 1)

number of users in train: 494
number of movies in train: 3599
number of users in test: 469
number of movies in test: 2288


In [8]:
train.head()

Unnamed: 0,userId,movieId,rating
18431191,127563,2571,1.5
8849191,61141,4246,5.0
11824579,81598,6539,5.0
7872978,54261,4973,5.0
6901525,47543,68554,3.0


## Process

In [None]:
ratings = pd.read_csv('ratings.csv')
# ratings.drop(['timestamp'],axis=1,inplace=True)
ratings.head()

In [None]:
ratings['year'] = ratings['timestamp'].apply(lambda x: date.fromtimestamp(x).year)
# ratings.sort_values('datetime', ascending=False, inplace=True)

In [None]:
ratings.head()

In [None]:
ratings.groupby('year').size()

In [None]:
ratings = ratings[ratings['year']>=date.fromisoformat('2010-01-01').year]
print(ratings.shape)
print(ratings.head())
ratings.groupby('year').size()

In [None]:
ratings.drop(['timestamp','year'],axis=1,inplace=True)
ratings.head()

In [None]:
users, u_count = np.unique(ratings_1015['userId'], return_counts=True)
print(len(users))
movies, m_count = np.unique(ratings_1015['movieId'], return_counts=True)
print(len(movies))

In [None]:
#check for movie counts
counts = ratings.groupby('movieId').size()
counts.describe()

In [None]:
#remove movies with less than 20 ratings - too few ratings are meaningless
m_index = counts[counts>=20].index
ratings = ratings[ratings['movieId'].isin(m_index)]
ratings.head()

In [None]:
#check for movie counts
counts = ratings.groupby('movieId').size()
counts.describe()

In [None]:
#check for user counts
ratings.groupby('userId').size().describe()

In [None]:
#mostly above 20 ratings
u_counts = ratings.groupby('userId').size()
u_index = u_counts[u_counts>=20].index
ratings = ratings[ratings['userId'].isin(u_index)]
print(ratings.shape)

In [None]:
#down from 30773 to 29080 - still adequate
users = np.unique(ratings['userId'])
print(len(users))
movies = np.unique(ratings['movieId'])
print(len(movies))

In [None]:
# testing = ratings_1015.drop(['timestamp','datetime'],axis=1)
# testing.head()

In [None]:
np.random.seed(1)
u_index_sample = np.random.choice(users,50)
print(len(u_index_sample))
sample = ratings[ratings['userId'].isin(u_index_sample)]

In [None]:
len(np.unique(sample['userId']))

In [None]:
sample.head()

In [None]:
users_sample, u_count_sample = np.unique(sample['userId'], return_counts=True)
print(len(users_sample))
movies_sample, m_count_sample = np.unique(sample['movieId'], return_counts=True)
print(len(movies_sample))

In [None]:
sample.groupby('userId').count().sort_values('movieId',ascending=False)

In [None]:
plt.plot(np.arange(1, len(users_sample)+1), sorted(sample.groupby('userId').size(), reverse=True))
plt.xlabel('users')
plt.ylabel('number of rated movies')
plt.show()

In [None]:
pct1 = pd.DataFrame(columns=['userId','movieId','rating'])
u_N, u_rating_count = np.unique(sample['userId'], return_counts=True)

In [None]:
for i in range(len(u_N)):
    u_id = u_N[i]
    u_sample = sample[sample['userId']==u_id]
    u_sample = u_sample[u_sample['movieId'].isin(np.random.choice(np.unique(u_sample['movieId']),
                                                                  int(u_rating_count[i]*0.1)))]
    pct1 = pct1.append(u_sample)

In [None]:
print(len(pct['userId'].unique()))
print(len(pct['movieId'].unique()))

In [None]:
#keep 50%
pct.groupby('movieId').size().describe()

In [None]:
#keep 30%
pct3.groupby('movieId').size().describe()

In [None]:
pct3.groupby('userId').size().describe()

In [None]:
#keep 10%
pct1.groupby('movieId').size().describe()

In [None]:
sample.groupby('movieId').size().describe()

### Train Test Split

In [None]:
# _50p = sample.groupby('userId').size().describe()[5]
# _50p
sample.groupby('userId').size().describe()

In [None]:
#no longer needed to set a minimum threshold
# print(len(users3),len(users3[u_count3<_50p]))
# user_train_ind = users3[u_count3<_50p]
# user_train_ind2 = users3[u_count3>=_50p]
# train1 = sample[sample['userId'].isin(user_train_ind)]

In [None]:
# train1

In [None]:
# train, test = train_test_split(sample[sample['userId'].isin(user_train_ind2)], test_size = 0.3)
train, test = train_test_split(sample, test_size = 0.3)

In [None]:
# train = train1.append(train2)

In [None]:
print(len(np.unique(sample['userId'])))
print(len(np.unique(sample['movieId'])))
sample.shape

In [None]:
print(len(np.unique(train['userId'])))
print(len(np.unique(train['movieId'])))
train.shape

In [None]:
print(len(np.unique(test['userId'])))
print(len(np.unique(test['movieId'])))
test.shape