## Data Exploration

In [None]:
import gzip
import string
from collections import defaultdict
import numpy as np

In [None]:
# Built once: translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess(x):
    """Normalize a piece of text and split it into lowercase word tokens.

    Every character in ``string.punctuation`` is removed, the text is
    lowercased, and the result is split on runs of whitespace.

    Args:
        x: The raw text as a ``str``.

    Returns:
        A list of tokens in their original order. The list is empty when
        ``x`` contains nothing but whitespace and/or punctuation.
    """
    x = x.translate(_PUNCTUATION_TABLE).lower()
    # split() with no separator treats tabs/newlines/spaces alike, collapses
    # consecutive whitespace and drops leading/trailing whitespace. The previous
    # split(' ') produced '' tokens for doubled spaces and [''] for empty text,
    # which inflated every word count derived from this function.
    return x.split()

In [None]:
# Single pass over the gzipped review dump, collecting the counts used by the
# plots and summaries in the cells below.
path = 'data/renttherunway_final_data.json.gz'

userCount = defaultdict(int)    # user_id -> number of reviews by that user
itemCount = defaultdict(int)    # item_id -> number of reviews of that item
reviewLengths = []              # per review: token count of text + summary
fitCount = defaultdict(int)     # value of the 'fit' field -> number of reviews
ratingCount = defaultdict(int)  # integer rating -> number of reviews

with gzip.open(path) as f:
    for line in f:
        try:
            # SECURITY NOTE(review): eval() executes whatever Python expression
            # the line contains, so this is only acceptable for a trusted file.
            # If the file is strict JSON-lines, json.loads would be the safer
            # parser -- TODO confirm the format before switching. Be aware that
            # eval() raises NameError on the JSON literals null/true/false, so
            # any line containing them is currently skipped by the handler.
            line = eval(line)
        except Exception:
            # Best-effort load: lines that cannot be evaluated are skipped.
            # Catching Exception (not a bare except) keeps KeyboardInterrupt
            # and SystemExit working, so the cell can still be interrupted.
            continue
        user = line['user_id']
        item = line['item_id']
        # Review length = tokens in the body plus tokens in the summary.
        rlen = len(preprocess(line['review_text'])) + len(preprocess(line['review_summary']))
        userCount[user] += 1
        itemCount[item] += 1
        reviewLengths.append(rlen)
        fitCount[line['fit']] += 1
        rating = int(line['rating'])
        ratingCount[rating] += 1

In [None]:
# Histogram of review lengths: how many reviews have each token count.
lengthCount = defaultdict(int)
for rlen in reviewLengths:
    lengthCount[rlen] = lengthCount[rlen] + 1
# Freeze into a list of (length, count) pairs in ascending length order.
# Lengths are unique dict keys, so plain tuple ordering sorts by length alone.
lengthCount = sorted(lengthCount.items())

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the review-length histogram, restricted to lengths below 200.
shortLengths = [(length, count) for length, count in lengthCount if length < 200]
plt.bar(
    [length for length, _ in shortLengths],
    [count for _, count in shortLengths],
)
plt.xlabel('Number of words in review')
plt.ylabel('Number of reviews')

In [None]:
# Distribution of user activity: for each per-user review count, the number of
# users who wrote exactly that many reviews.
userDist = defaultdict(int)
for reviewsWritten in userCount.values():
    userDist[reviewsWritten] += 1

# (review count, number of users) pairs in ascending review-count order; the
# review counts are unique dict keys, so tuple ordering sorts by them alone.
userDist = sorted(userDist.items())

# Plot only the bars for users with fewer than 8 reviews.
lowActivity = [(n, users) for n, users in userDist if n < 8]
plt.bar([n for n, _ in lowActivity], [users for _, users in lowActivity])
plt.xlabel('Number of reviews per user')
plt.ylabel('Number of users')
# Quartiles of reviews-per-user (last expression, so the notebook displays it).
np.percentile(list(userCount.values()), [25, 50, 75])

In [None]:
# Distribution of item popularity: for each per-item review count, the number
# of items that received exactly that many reviews.
itemDist = defaultdict(int)
for c in itemCount.values():  # only the counts matter; item ids are not used
    itemDist[c] += 1

# (review count, number of items) pairs in ascending review-count order.
itemDist = list(itemDist.items())
itemDist.sort(key=lambda x: x[0])

# Plot only the bars for items with fewer than 100 reviews.
plt.bar([x[0] for x in itemDist if x[0] < 100], [x[1] for x in itemDist if x[0] < 100])
plt.xlabel('Number of reviews per item')
# Label fixed from 'Number of item' to match the plural used on the user plot.
plt.ylabel('Number of items')
# Quartiles of reviews-per-item (last expression, so the notebook displays it).
np.percentile([x[1] for x in itemCount.items()], [25, 50, 75])

In [None]:
# Bar chart of how many reviews carry each value of the 'fit' field.
# NOTE: fitCount is rebound from the mapping to its items() view, so running
# this cell a second time fails (the view has no .items()).
fitCount = fitCount.items()
fitValues = []
fitTotals = []
for fitValue, total in fitCount:
    fitValues.append(fitValue)
    fitTotals.append(total)
plt.bar(fitValues, fitTotals)
plt.ylabel('Number of reviews')

In [None]:
# Bar chart of how many reviews were given each integer rating.
# NOTE: ratingCount is rebound from the mapping to its items() view, so running
# this cell a second time fails (the view has no .items()).
ratingCount = ratingCount.items()
ratingValues = []
ratingTotals = []
for ratingValue, total in ratingCount:
    ratingValues.append(ratingValue)
    ratingTotals.append(total)
plt.bar(ratingValues, ratingTotals)
plt.xlabel('Rating')
plt.ylabel('Number of reviews')

In [None]:
# Headline dataset sizes: distinct users, distinct items, and total reviews
# (one entry is appended to reviewLengths per loaded review).
print(f"Number of users: {len(userCount)}")
print(f"Number of items: {len(itemCount)}")
# Message typo fixed: "reivews" -> "reviews".
print(f"Number of reviews: {len(reviewLengths)}")
# Quartiles of review length in tokens (last expression, so the notebook
# displays it).
np.percentile(reviewLengths, [25, 50, 75])