# 4.1 Preparing Data Samples

## Helpers

In [79]:
import sys, time
# Start time of the stopwatch currently running, or None when it is idle.
_st = None
def timeit():
    """Toggle a module-level stopwatch.

    When no stopwatch is running, record the current time and return.
    When one is running, print the whole seconds elapsed since it was
    started and reset it, so the next call starts a new measurement.
    """
    global _st
    if _st is not None:
        elapsed = time.time() - _st
        print(int(elapsed))
        _st = None
        return
    _st = time.time()

## Loading Data

In [74]:
from datetime import datetime as dt
import pandas as pd

In [76]:
# Number of numbered CSV pages read for the reviews and for the language data.
REVIEW_PAGES = 32
LANGS_PAGES = 21

# Root of the prepared data; the two templates take a page number via `%`.
DIR_PREP = '../data/prepared/'
PATH_REVIEWS = f'{DIR_PREP}reviews/%d.csv'
PATH_LANGS = f'{DIR_PREP}reviews/languages/%d.csv'

In [146]:
def load_reviews():
    """Load all review pages into a single DataFrame.

    Reads the header-less CSV files for pages 1..REVIEW_PAGES (inclusive)
    from PATH_REVIEWS, stacks them row-wise with a fresh 0..n-1 index and
    assigns the eleven review column names.

    Returns:
        The concatenated reviews DataFrame.
    """
    column_names = [
        'uid','gid','polarity','ea','pt_forever','pt_review',
        'ts_created','ts_updated','votes_up','votes_funny','text'
    ]
    pages = [
        pd.read_csv(PATH_REVIEWS % page_num, header=None)
        for page_num in range(1, REVIEW_PAGES + 1)
    ]
    reviews = pd.concat(pages, ignore_index=True)
    reviews.columns = column_names
    return reviews

def load_reviews_lang():
    """Load reviews together with their per-review language columns.

    For pages 1..LANGS_PAGES (inclusive) the header-less review CSV and the
    header-less language CSV of the same page number are joined column-wise
    (aligned on the default row index of each file); the joined pages are
    then stacked row-wise with a fresh 0..n-1 index.

    Returns:
        A DataFrame with the eleven review columns followed by
        'lang', 'confidence', 'nlp_words', 'raw_words' and 'raw_chars'.
    """
    column_names = [
        'uid','gid','polarity','ea','pt_forever','pt_review',
        'ts_created','ts_updated','votes_up','votes_funny','text',
        'lang','confidence','nlp_words','raw_words','raw_chars'
    ]
    joined_pages = []
    for page_num in range(1, LANGS_PAGES + 1):
        reviews_page = pd.read_csv(PATH_REVIEWS % page_num, header=None)
        langs_page = pd.read_csv(PATH_LANGS % page_num, header=None)
        joined_pages.append(pd.concat((reviews_page, langs_page), axis=1))
    combined = pd.concat(joined_pages, ignore_index=True)
    combined.columns = column_names
    return combined

In [80]:
# df_all: every review page (1..REVIEW_PAGES), review columns only.
df_all = load_reviews()
# df_lang: only the pages 1..LANGS_PAGES, with the language columns attached.
df_lang = load_reviews_lang()

## Sample Data

In [82]:
# Output directory for sample files; the template takes a sample name via `%`.
DIR_SAMPLE = '../data/samples/'
PATH_REVSENTS = f'{DIR_SAMPLE}review_sentiment/%s.csv'

In [235]:
def prepare_review_sentiment_data(df, N=10000, min_conf=0.7, long_words=50,
                                  random_state=None):
    """Write twelve random review samples for sentiment experiments to CSV.

    One file is written for every combination of
      * language: 'eng' (lang == 'en' with confidence >= min_conf) or 'any',
      * polarity: 'eq' (N // 2 rows drawn from each polarity group) or
        'any' (N rows drawn regardless of polarity),
      * length:   'any', 'short' (nlp_words < long_words) or
        'long' (nlp_words >= long_words).
    Rows with nlp_words < 1 are always excluded. Files are named
    '<language>_<polarity>_<length>_<N>' and written through PATH_REVSENTS
    without the index.

    Args:
        df: DataFrame with at least the columns 'nlp_words', 'lang',
            'confidence' and 'polarity'.
        N: Requested sample size.
        min_conf: Minimum language-detection confidence for a review to
            count as English.
        long_words: Minimum 'nlp_words' for a review to count as long.
        random_state: Optional seed (or random state) forwarded to every
            pandas ``sample`` call so the samples can be reproduced. The
            default ``None`` keeps the original unseeded behaviour.

    Raises:
        ValueError: Raised by pandas when a subset (or one of its polarity
            groups) holds fewer rows than requested, because sampling is
            done without replacement.
    """
    is_valid = df['nlp_words'] >= 1
    is_english = (df['lang'] == 'en') & (df['confidence'] >= min_conf)
    is_long = df['nlp_words'] >= long_words

    # (file-name tag, row mask) pairs; None means "no length restriction".
    language_masks = (('eng', is_valid & is_english), ('any', is_valid))
    length_masks = (('any', None), ('short', ~is_long), ('long', is_long))

    for lang_tag, lang_mask in language_masks:
        for len_tag, len_mask in length_masks:
            mask = lang_mask if len_mask is None else lang_mask & len_mask
            subset = df[mask]

            # Equal polarities: the same number of rows from each group.
            balanced = subset.groupby('polarity').sample(
                n=(N // 2), random_state=random_state
            )
            balanced.to_csv(
                PATH_REVSENTS % f'{lang_tag}_eq_{len_tag}_{N}', index=False
            )

            # Any polarity: a plain random sample of the subset.
            unbalanced = subset.sample(n=N, random_state=random_state)
            unbalanced.to_csv(
                PATH_REVSENTS % f'{lang_tag}_any_{len_tag}_{N}', index=False
            )

In [236]:
# Write all sample files from the language-annotated reviews, requesting
# N=100000 rows per sample (other parameters keep their defaults).
prepare_review_sentiment_data(df_lang, N=100000)

## Filter Stuff (use later)

In [45]:
def filter_early_access(df):
    """Return only the rows of `df` whose 'ea' column equals 0.

    The input frame is not modified; the original row index is kept.
    """
    not_early_access = df['ea'].eq(0)
    return df.loc[not_early_access]

def filter_small_games(df, G=100):
    """Keep only rows belonging to games with at least `G` rows in `df`.

    Args:
        df: DataFrame with a 'gid' column.
        G: Minimum number of rows a 'gid' must have to be kept.

    Returns:
        The rows of `df` whose 'gid' occurs at least `G` times.
    """
    rows_per_game = df.groupby('gid')['gid'].transform('size')
    big_enough = rows_per_game >= G
    return df[big_enough]

def filter_old_games(df, Y=2014):
    """Drop games whose earliest 'ts_created' lies before 1 January of `Y`.

    The cutoff is built from a naive datetime, so `.timestamp()` interprets
    it in the local timezone of the machine running the code.
    NOTE(review): this assumes 'ts_created' holds Unix epoch seconds — confirm.

    Args:
        df: DataFrame with 'gid' and 'ts_created' columns.
        Y: Cutoff year.

    Returns:
        The rows of games whose minimum 'ts_created' is >= the cutoff.
    """
    cutoff = dt(Y, 1, 1).timestamp()
    first_created = df.groupby('gid')['ts_created'].transform('min')
    return df[first_created >= cutoff]

In [200]:
# Build the working set of reviews. Order matters: games are counted for the
# G=300 threshold only after the rows with ea != 0 have been removed.
df_f = filter_early_access(df_all)
df_f = filter_small_games(df_f, G=300)
# Take an explicit copy: the filters return boolean-indexed selections of
# df_all, and new columns are assigned to df_f afterwards, which would
# otherwise trigger pandas' SettingWithCopyWarning.
df_f = filter_old_games(df_f, Y=2014).copy()

In [201]:
# Per-game earliest and latest review timestamp, broadcast to every row.
# The frames built by load_reviews() have 'ts_created' and 'ts_updated' but
# no 'ts_review' column, so reading 'ts_review' raises KeyError.
# NOTE(review): 'ts_updated' is used to match the `tsu` in the new column
# names — confirm this is the intended timestamp.
df_f['gid_tsu_min'] = df_f.groupby('gid')['ts_updated'].transform('min')
df_f['gid_tsu_max'] = df_f.groupby('gid')['ts_updated'].transform('max')

In [234]:
# Length of the "month" window in seconds, using a fixed 30-day month.
month_seconds = 60 * 60 * 24 * 30

In [202]:
# Mean polarity per game over its first / last `month_seconds` of reviews.
# Only rows inside the respective window receive a value; all other rows of
# df_f get NaN from the index-aligned assignment.
# The loaded review columns have no 'ts_review' (reading it raises KeyError).
# NOTE(review): 'ts_updated' is used to match the `tsu` in gid_tsu_min /
# gid_tsu_max — confirm this is the intended timestamp.
df_f['fm_mean'] = (
    df_f[df_f['ts_updated'] < (df_f['gid_tsu_min'] + month_seconds)]
    .groupby('gid')['polarity']
    .transform('mean')
)
df_f['lm_mean'] = (
    df_f[df_f['ts_updated'] > (df_f['gid_tsu_max'] - month_seconds)]
    .groupby('gid')['polarity']
    .transform('mean')
)

In [203]:
# Number of (non-null) polarity values per game inside its first / last
# `month_seconds` window. Rows outside the window get NaN from the
# index-aligned assignment.
# The loaded review columns have no 'ts_review' (reading it raises KeyError).
# NOTE(review): 'ts_updated' is used to match the `tsu` in gid_tsu_min /
# gid_tsu_max — confirm this is the intended timestamp.
df_f['fm_count'] = (
    df_f[df_f['ts_updated'] < (df_f['gid_tsu_min'] + month_seconds)]
    .groupby('gid')['polarity']
    .transform('count')
)
df_f['lm_count'] = (
    df_f[df_f['ts_updated'] > (df_f['gid_tsu_max'] - month_seconds)]
    .groupby('gid')['polarity']
    .transform('count')
)

In [205]:
# Spread each window statistic to every row of its game: the per-game max
# replaces the NaN left on rows that fell outside the window.
for stat_col in ('fm_mean', 'lm_mean', 'fm_count', 'lm_count'):
    df_f[stat_col] = df_f.groupby('gid')[stat_col].transform('max')

In [214]:
# One row per game holding its first-/last-month statistics. The four columns
# are selected before aggregating so the max is not computed over every other
# column of df_f (including the free-text one) only to be discarded.
df_gid = df_f.groupby('gid')[['fm_mean', 'lm_mean', 'fm_count', 'lm_count']].max()

In [225]:
# Absolute change in mean polarity between a game's first and last window.
df_gid['mean_diff'] = (df_gid['fm_mean'] - df_gid['lm_mean']).abs()

In [233]:
# Average first-vs-last difference over all games, then restricted to games
# with more than 5, 10 and 100 reviews in BOTH windows.
print(df_gid['mean_diff'].mean())
for min_count in (5, 10, 100):
    enough_reviews = (df_gid['fm_count'] > min_count) & (df_gid['lm_count'] > min_count)
    print(df_gid.loc[enough_reviews, 'mean_diff'].mean())

0.20070359753134023
0.12173734958015847
0.0969595773899169
0.08535472004851985


In [226]:
# Games with enough reviews in both windows for the comparison below.
# NOTE(review): the thresholds are asymmetric (fm_count > 10, lm_count > 100),
# and the mean recorded further down for this slice equals the value printed
# for the >10 / >10 filter, so the 100 may be unintended — confirm.
df_gid_slice = df_gid.loc[(df_gid['fm_count'] > 10) & (df_gid['lm_count'] > 100)]

In [223]:
# Correlation (pandas default method, Pearson) between the first-window and
# last-window mean polarity of the sliced games; displayed as the cell value.
df_gid_slice['fm_mean'].corr(df_gid_slice['lm_mean'])

0.558903338968747

In [229]:
# Average absolute first-vs-last polarity difference over the sliced games;
# displayed as the cell value.
df_gid_slice['mean_diff'].mean()

0.0969595773899169