# 5.1 Predictive Users: Data Preparation

In [1]:
from collections import defaultdict as dd
from datetime import datetime as dt
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pandas as pd
import random

In [2]:
# Root folders referenced by the path templates below.
DIR_PREP = '../data/prepared/'
DIR_SAMPLE = '../data/samples/'

# Path templates: the placeholder is filled with a page number (%d)
# or with a sample name (%s) by the code that uses them.
PATH_REVIEWS = f'{DIR_PREP}reviews/%d.csv'
PATH_LANGS = f'{DIR_PREP}reviews/languages/%d.csv'
PATH_REVUSERS = f'{DIR_SAMPLE}review_users/%s.csv'

# Last page number read by load_reviews() and load_reviews_lang() respectively.
REVIEW_PAGES, LANGS_PAGES = 32, 21

In [3]:
def load_reviews():
    """Read every prepared review page and stack them into one DataFrame.

    Pages 1 through ``REVIEW_PAGES`` are read from ``PATH_REVIEWS`` as
    header-less CSV files, concatenated row-wise with a fresh 0..n-1 index,
    and then given the column names listed below.

    Returns:
        pandas.DataFrame: all review pages combined.
    """
    column_names = [
        'uid', 'gid', 'polarity', 'ea', 'pt_forever', 'pt_review',
        'ts_created', 'ts_updated', 'votes_up', 'votes_funny', 'text',
    ]
    pages = [
        pd.read_csv(PATH_REVIEWS % page_num, header=None)
        for page_num in range(1, REVIEW_PAGES + 1)
    ]
    reviews = pd.concat(pages, ignore_index=True)
    reviews.columns = column_names
    return reviews

def load_reviews_lang():
    """Read the review pages together with their language files.

    For each page from 1 through ``LANGS_PAGES`` the review CSV
    (``PATH_REVIEWS``) and the language CSV (``PATH_LANGS``) are read without
    a header and joined side by side; the per-page results are then stacked
    row-wise with a fresh 0..n-1 index and given the column names below.

    Returns:
        pandas.DataFrame: review columns followed by the language columns.
    """
    column_names = [
        'uid', 'gid', 'polarity', 'ea', 'pt_forever', 'pt_review',
        'ts_created', 'ts_updated', 'votes_up', 'votes_funny', 'text',
        'lang', 'confidence', 'nlp_words', 'raw_words', 'raw_chars',
    ]

    def read_page(page_num):
        # Review columns first, language columns second, aligned on row index.
        review_part = pd.read_csv(PATH_REVIEWS % page_num, header=None)
        lang_part = pd.read_csv(PATH_LANGS % page_num, header=None)
        return pd.concat([review_part, lang_part], axis=1)

    pages = [read_page(page_num) for page_num in range(1, LANGS_PAGES + 1)]
    combined = pd.concat(pages, ignore_index=True)
    combined.columns = column_names
    return combined

In [12]:
def filter_reviews(df, is_eng=True, min_game_reviews=50, min_user_reviews=10, min_conf=0.5):
    """Return the subset of reviews that passes all quality filters.

    Filters are applied in this order:

    1. Row-wise: ``ea`` equals 0, ``text`` is not the
       ``'(Review text hidden)'`` placeholder, and ``nlp_words`` is at
       least 1. When ``is_eng`` is true, ``lang`` must also be ``'en'``
       with ``confidence`` of at least ``min_conf``.
    2. Only games (``gid``) with at least ``min_game_reviews`` remaining
       reviews are kept.
    3. Only users (``uid``) with at least ``min_user_reviews`` reviews
       remaining after step 2 are kept.

    Args:
        df: Review frame containing the columns named above.
        is_eng: Whether to apply the language/confidence filter.
        min_game_reviews: Minimum number of reviews per game.
        min_user_reviews: Minimum number of reviews per user.
        min_conf: Minimum ``confidence``; only used when ``is_eng`` is true.

    Returns:
        pandas.DataFrame: the filtered rows, keeping their original index.
    """
    def keep_frequent(frame, key, minimum):
        # Keep the rows whose `key` value occurs at least `minimum` times.
        occurrences = frame.groupby(key)[key].transform('size')
        return frame[occurrences >= minimum]

    usable = (
        (df['ea'] == 0)
        & (df['text'] != '(Review text hidden)')
        & (df['nlp_words'] >= 1)
    )
    if is_eng:
        usable &= (df['lang'] == 'en') & (df['confidence'] >= min_conf)

    # The count-based filters depend on each other, so their order matters:
    # games are counted first, users are counted on what is left.
    kept = keep_frequent(df[usable], 'gid', min_game_reviews)
    return keep_frequent(kept, 'uid', min_user_reviews)

In [20]:
def mean_to_class_6(x):
    """Bucket a mean value ``x`` into one of six ordinal classes (0-5).

    The class is the position of the first upper bound that ``x`` is
    strictly below; a value below none of them (including NaN, which
    compares false against every bound) lands in the last class, 5.
    """
    upper_bounds = (0.2, 0.4, 0.7, 0.8, 0.95)
    for label, bound in enumerate(upper_bounds):
        if x < bound:
            return label
    return len(upper_bounds)

def mean_to_class_3(x):
    """Bucket a mean value ``x`` into one of three ordinal classes (0-2).

    The class is the position of the first upper bound that ``x`` is
    strictly below; a value below none of them (including NaN, which
    compares false against every bound) lands in the last class, 2.
    """
    upper_bounds = (0.4, 0.7)
    for label, bound in enumerate(upper_bounds):
        if x < bound:
            return label
    return len(upper_bounds)

In [21]:
def prepare_data(df, sample_name, num_labels=7, N=100000, train_size=0.8, test_size=0.5, seed=None):
    """Label reviews, sample users, and write train/val/test CSV files.

    Every review gets the mean ``polarity`` of its game (``gid_mean``) and a
    discretised version of that mean (``gid_mean_class``). The unique users
    are shuffled and truncated so that the sample holds roughly ``N``
    reviews, then split by user into train, validation and test sets, so a
    user's reviews never span two splits. Each split is written to
    ``PATH_REVUSERS % f'{sample_name}_{num_labels}_{split}'`` with
    ``split`` being ``train``, ``val`` or ``test``.

    The caller's frame is not modified: the two label columns are added to
    a separate frame. Assigning them to ``df`` directly would write into
    what is usually a filtered slice, which triggers pandas'
    SettingWithCopyWarning and leaks columns into the caller's data.

    Args:
        df: Review frame with at least the columns ``uid``, ``gid``,
            ``polarity``, ``pt_review`` and ``text``.
        sample_name: Prefix used in the output file names.
        num_labels: ``6`` selects ``mean_to_class_6``; any other value,
            including the default ``7``, uses ``mean_to_class_3``. The value
            is written into the file names verbatim either way.
            NOTE(review): the default therefore yields 3-class labels in
            files tagged ``_7_`` - confirm whether that is intended.
        N: Approximate number of reviews to sample; converted to a number
            of users via the average number of reviews per user.
        train_size: ``train_size`` of the first ``train_test_split`` call.
        test_size: ``test_size`` of the second ``train_test_split`` call,
            applied to the users left over after the training split.
        seed: Seed for the shuffle and for both splits.

    Raises:
        ValueError: If ``df`` contains no users to sample from.
    """
    columns = ['uid', 'gid', 'gid_mean', 'gid_mean_class', 'pt_review', 'text']
    class_function = mean_to_class_6 if num_labels == 6 else mean_to_class_3

    # Build the output frame separately so the caller's frame stays untouched.
    gid_mean = df.groupby('gid')['polarity'].transform('mean')
    labelled = df[['uid', 'gid', 'pt_review', 'text']].assign(
        gid_mean=gid_mean,
        gid_mean_class=gid_mean.apply(class_function),
    )[columns]

    uids = labelled['uid'].unique().tolist()
    if not uids:
        # Without this guard the ratio below fails with ZeroDivisionError.
        raise ValueError('prepare_data() needs at least one review to sample from')

    random.seed(seed)
    random.shuffle(uids)
    # Translate the review budget N into a user budget.
    reviews_per_user = len(labelled.index) / len(uids)
    uids = uids[:int(N / reviews_per_user)]

    train_uids, rem_uids = train_test_split(uids, train_size=train_size, random_state=seed)
    val_uids, test_uids = train_test_split(rem_uids, test_size=test_size, random_state=seed)

    splits = (('train', train_uids), ('val', val_uids), ('test', test_uids))
    for split_name, split_uids in splits:
        labelled[labelled['uid'].isin(split_uids)].to_csv(
            PATH_REVUSERS % f'{sample_name}_{num_labels}_{split_name}', index=False)

In [7]:
# Load review pages 1..LANGS_PAGES joined with their language columns.
df_lang = load_reviews_lang()

In [13]:
# Two filtered views of the same frame: with and without the English filter.
df_eng, df_neng = (
    filter_reviews(df_lang, is_eng=english_only)
    for english_only in (True, False)
)

In [22]:
# Write 3-class and 6-class samples for each filtered frame, in that order.
for frame, prefix in ((df_eng, 'eng_160k'), (df_neng, 'any_160k')):
    for label_count in (3, 6):
        prepare_data(frame, prefix, num_labels=label_count, N=160000, seed=1337)