# Train-Valid-Test Split EDA / Sanity Check

In [1]:
# Import libraries
import numpy as np
import pandas as pd

In [2]:
# Load the three pre-split tables (train / validation / test), one pickle each.
# NOTE: unpickling can execute arbitrary code, so these files should only ever
# come from a trusted source.
train_df, valid_df, test_df = (
    pd.read_pickle(pickle_path)
    for pickle_path in ("data/train.pkl", "data/val.pkl", "data/test.pkl")
)

# Assert no users in multiple sets

In [3]:
# Distinct user_id values found in each split, collapsed into sets
train_users, valid_users, test_users = (
    set(split_df['user_id']) for split_df in (train_df, valid_df, test_df)
)

In [4]:
# Assert no overlap
# Every pair of splits must have disjoint user_id sets. Each assert carries a
# message (only evaluated when the check fails) that names the offending pair
# and reports how many ids they share, so a leak is diagnosable from the
# traceback alone.
null_set = set()
assert train_users.intersection(valid_users) == null_set, (
    "{} user_id value(s) appear in both train and valid".format(
        len(train_users.intersection(valid_users))
    )
)
assert train_users.intersection(test_users) == null_set, (
    "{} user_id value(s) appear in both train and test".format(
        len(train_users.intersection(test_users))
    )
)
assert valid_users.intersection(test_users) == null_set, (
    "{} user_id value(s) appear in both valid and test".format(
        len(valid_users.intersection(test_users))
    )
)

# Assert no businesses in multiple sets

In [5]:
# Distinct business_id values found in each split, collapsed into sets
train_bis, valid_bis, test_bis = (
    set(split_df['business_id']) for split_df in (train_df, valid_df, test_df)
)

In [6]:
# Assert no overlap
# Every pair of splits must have disjoint business_id sets. Each assert carries
# a message (only evaluated when the check fails) that names the offending pair
# and reports how many ids they share.
null_set = set()
assert train_bis.intersection(valid_bis) == null_set, (
    "{} business_id value(s) appear in both train and valid".format(
        len(train_bis.intersection(valid_bis))
    )
)
assert train_bis.intersection(test_bis) == null_set, (
    "{} business_id value(s) appear in both train and test".format(
        len(train_bis.intersection(test_bis))
    )
)
assert valid_bis.intersection(test_bis) == null_set, (
    "{} business_id value(s) appear in both valid and test".format(
        len(valid_bis.intersection(test_bis))
    )
)

# Assert no reviews in multiple sets

In [7]:
# Distinct review_id_r values found in each split, collapsed into sets
train_rev, valid_rev, test_rev = (
    set(split_df['review_id_r']) for split_df in (train_df, valid_df, test_df)
)

In [8]:
# Assert no overlap
# Every pair of splits must have disjoint review_id_r sets. Each assert carries
# a message (only evaluated when the check fails) that names the offending pair
# and reports how many ids they share.
null_set = set()
assert train_rev.intersection(valid_rev) == null_set, (
    "{} review_id_r value(s) appear in both train and valid".format(
        len(train_rev.intersection(valid_rev))
    )
)
assert train_rev.intersection(test_rev) == null_set, (
    "{} review_id_r value(s) appear in both train and test".format(
        len(train_rev.intersection(test_rev))
    )
)
assert valid_rev.intersection(test_rev) == null_set, (
    "{} review_id_r value(s) appear in both valid and test".format(
        len(valid_rev.intersection(test_rev))
    )
)

# Summary Statistics

## Number of Users

In [9]:
# Report the number of distinct user_id values per split, one line each
print(
    "Users in train set: {}".format(len(train_users)),
    "Users in valid set: {}".format(len(valid_users)),
    "Users in test set:  {}".format(len(test_users)),
    sep="\n",
)

Users in train set: 284793
Users in valid set: 153642
Users in test set:  269028


## Number of Businesses

In [10]:
# Report the number of distinct business_id values per split, one line each
print("\n".join([
    "Businesses in train set: {}".format(len(train_bis)),
    "Businesses in valid set: {}".format(len(valid_bis)),
    "Businesses in test set:  {}".format(len(test_bis)),
]))

Businesses in train set: 39484
Businesses in valid set: 34858
Businesses in test set:  75904


## Number of Reviews

In [11]:
# Report the row count of each split (every row is counted, not distinct ids)
print(
    "Reviews in train set: {}".format(len(train_df)),
    "Reviews in valid set: {}".format(len(valid_df)),
    "Reviews in test set:  {}".format(len(test_df)),
    sep="\n",
)

Reviews in train set: 2476843
Reviews in valid set: 296241
Reviews in test set:  328428


## Percentage of 5 Star Reviews (shown as a fraction between 0 and 1)

In [12]:
def _five_star_fraction(split_df):
    """Return the fraction of rows in `split_df` whose 'stars_r' value equals 5.

    Matches are counted directly on the boolean comparison, so no filtered
    copy of the frame is built just to read its row count.

    Args:
        split_df: table with a 'stars_r' column (one of the split frames).

    Returns:
        float in [0, 1]: matching rows divided by total rows.

    Raises:
        ZeroDivisionError: if `split_df` has no rows.
    """
    # int(...) turns the numpy count into a plain int so the division below
    # is ordinary Python true division.
    five_star_rows = int((split_df['stars_r'] == 5).sum())
    return five_star_rows / len(split_df)


# Share of 5-star rows in each split (a 0-1 fraction, not a 0-100 percentage)
train_p = _five_star_fraction(train_df)
valid_p = _five_star_fraction(valid_df)
test_p = _five_star_fraction(test_df)

In [16]:
# Report the 5-star share of each split to three decimals
# (the values are fractions of 1)
print("\n".join([
    "Percentage of 5 star reviews in train set: {:.3f}".format(train_p),
    "Percentage of 5 star reviews in valid set: {:.3f}".format(valid_p),
    "Percentage of 5 star reviews in test set:  {:.3f}".format(test_p),
]))

Percentage of 5 star reviews in train set: 0.383
Percentage of 5 star reviews in valid set: 0.501
Percentage of 5 star reviews in test set:  0.541
