# Data Preprocessing

In [2]:
import pandas as pd
import json
import random

In [60]:
# Raw interaction log. The file has no header row, so the column labels are
# attached after loading.
raw_ratings_path = '../data/reddit_ratings.csv'
raw_column_names = ['username', 'subreddit', 'utc']

df = pd.read_csv(raw_ratings_path, header=None)
df.columns = raw_column_names

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,username,subreddit,utc
0,plzstoptalkingg,assholedesign,1562740000.0
1,plzstoptalkingg,BrandNewSentence,1562711000.0
2,plzstoptalkingg,goodomens,1562644000.0
3,plzstoptalkingg,goodomens,1562636000.0
4,plzstoptalkingg,relationships,1562629000.0


In [61]:
# Cleaning thresholds for this cell.
MAX_COMMENTS_PER_USER = 1000
MIN_SUBREDDITS_PER_USER = 10

# Fold case BEFORE grouping and de-duplicating. This cell lower-cases both
# keys, i.e. case is not meant to distinguish users or subreddits; doing it
# after the steps below would let 'AskReddit' and 'askreddit' count as two
# different subreddits towards the threshold and leave duplicate
# (username, subreddit) pairs behind once they are lower-cased.
df['username'] = df['username'].str.lower()
df['subreddit'] = df['subreddit'].str.lower()

#limit each user to 1000 comments
# head() keeps the first N rows of every user and preserves the frame's row
# order; unlike groupby(...).apply(...) it does not depend on how apply
# treats the grouping column.
df = df.groupby('username').head(MAX_COMMENTS_PER_USER)

#only count user interactions on unique subreddits
# (keeps the first occurrence of each pair)
df = df.drop_duplicates(subset=['username', 'subreddit'])

#only count users that have interacted with at least 10 subreddits
df = df.groupby('username').filter(lambda x: len(x) >= MIN_SUBREDDITS_PER_USER)

In [76]:
# Convert both key columns to pandas categoricals so that integer codes can
# be derived from them later.
for key_column in ('username', 'subreddit'):
    df[key_column] = df[key_column].astype('category')

In [77]:
# Forward maps go from integer position in the category index to the label;
# the inverse maps go from label back to that position.
username_categories = df['username'].cat.categories
user_dict = dict(enumerate(username_categories))
inv_user_dict = {name: code for code, name in user_dict.items()}

subreddit_categories = df['subreddit'].cat.categories
subreddit_dict = dict(enumerate(subreddit_categories))
inv_subreddit_dict = {name: code for code, name in subreddit_dict.items()}

In [78]:
# Persist each [forward, inverse] pair of lookup tables as one JSON file.
# Note: JSON object keys are always strings, so the integer keys of the
# forward maps are written (and later read back) as strings.
id_map_files = {
    '../data/user10.json': [user_dict, inv_user_dict],
    '../data/subreddit10.json': [subreddit_dict, inv_subreddit_dict],
}

for json_path, lookup_pair in id_map_files.items():
    with open(json_path, 'w') as f:
        json.dump(lookup_pair, f)

In [79]:
# Replace the categorical labels with their integer codes. This cell is not
# re-runnable on its own: once the columns hold plain integers they no longer
# expose the .cat accessor.
for key_column in ('username', 'subreddit'):
    df[key_column] = df[key_column].cat.codes

In [80]:
# Leave-one-out split: each user's first row (in df's current row order) is
# held out as the test positive, and every later row of that user goes to
# the training set.
position_in_user = df.groupby('username').cumcount()

# A stable sort groups the rows by user while keeping each user's rows in
# their existing relative order.
df_train = df[position_in_user > 0].sort_values('username', kind='mergesort')

# Select the whole first ROW per user via the position mask.
# GroupBy.first() skips nulls column by column by default, so it could
# combine values from different rows if a column (e.g. 'utc') had gaps.
df_test_positive = (
    df[position_in_user == 0]
    .sort_values('username', kind='mergesort')
    .reset_index(drop=True)  # 0..n-1 index, one row per user
)

In [81]:
# For every held-out (user, subreddit) pair, draw NUM_NEGATIVES subreddits
# the user has never interacted with.
NUM_NEGATIVES = 99
NEGATIVE_SAMPLING_SEED = 42

# Seed the sampler so that re-running the notebook yields the same negatives.
random.seed(NEGATIVE_SAMPLING_SEED)

negative_columns = ['negativeItemID' + str(i) for i in range(1, NUM_NEGATIVES + 1)]
subreddits = range(df['subreddit'].nunique())

# One pass over df to collect each user's subreddits as a set, instead of
# re-filtering the whole frame for every user. tolist() yields plain Python
# ints, which keeps the ids integral.
subreddits_by_user = {}
for user_code, subreddit_code in zip(df['username'].tolist(), df['subreddit'].tolist()):
    subreddits_by_user.setdefault(user_code, set()).add(subreddit_code)

# Read the id columns directly rather than through iterrows(): iterrows()
# builds one Series per row, and the float 'utc' column would upcast the
# integer ids to floats, so the (user, item) tuples would hold floats.
test_users = df_test_positive['username'].tolist()
test_items = df_test_positive['subreddit'].tolist()

# Collect rows in a list and build the frame once; appending to a DataFrame
# row by row re-allocates on every insert.
negative_rows = []
for i, (username, subreddit) in enumerate(zip(test_users, test_items)):
    if i%500 == 0:
        print('Processing: ' + str(i))
    rating = (username, subreddit)
    user_subreddits = subreddits_by_user[username]
    user_neg_subreddits = [sub for sub in subreddits if sub not in user_subreddits]
    # random.sample raises ValueError if fewer than NUM_NEGATIVES candidates remain.
    sampled_neg_subreddits = random.sample(user_neg_subreddits, k=NUM_NEGATIVES)
    negative_rows.append([rating] + sampled_neg_subreddits)

df_test_negative = pd.DataFrame(negative_rows, columns=['user_item'] + negative_columns)

Processing: 0
Processing: 500
Processing: 1000
Processing: 1500
Processing: 2000
Processing: 2500
Processing: 3000
Processing: 3500
Processing: 4000
Processing: 4500
Processing: 5000
Processing: 5500
Processing: 6000
Processing: 6500
Processing: 7000
Processing: 7500
Processing: 8000
Processing: 8500
Processing: 9000
Processing: 9500
Processing: 10000
Processing: 10500
Processing: 11000
Processing: 11500
Processing: 12000
Processing: 12500
Processing: 13000
Processing: 13500
Processing: 14000
Processing: 14500
Processing: 15000
Processing: 15500
Processing: 16000
Processing: 16500
Processing: 17000
Processing: 17500
Processing: 18000
Processing: 18500
Processing: 19000
Processing: 19500
Processing: 20000
Processing: 20500
Processing: 21000
Processing: 21500
Processing: 22000
Processing: 22500
Processing: 23000
Processing: 23500
Processing: 24000
Processing: 24500
Processing: 25000
Processing: 25500
Processing: 26000
Processing: 26500
Processing: 27000
Processing: 27500
Processing: 28000

In [89]:
# Write the three splits as bare CSV files: no index column, no header row.
split_outputs = {
    '../data/reddit_train_10.csv': df_train,
    '../data/reddit_test_positive_10.csv': df_test_positive,
    '../data/reddit_test_negative_10.csv': df_test_negative,
}

for csv_path, split_frame in split_outputs.items():
    split_frame.to_csv(csv_path, index=False, header=False)