In [4]:
import sys
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse import lil_matrix

sys.path.append('../../../')
from poe_price.data.access.session import PSQLSession

pd.set_option('display.max_columns', None)

# Trading currencies rates

In [5]:
with PSQLSession('127.0.0.1', 'poe_price', 'fabio', 'password') as session:
    currency = session.query('SELECT * FROM trade_currency;')

currency['rate'] = currency.sell_quantity / currency.price_quantity

In [6]:
OUTLIER_WINDOW = 1    # remove trades rate which value is outside the normal distribution window 
                      # defined as (mean_value +- standard deviation * OUTLIER_WINDOW)

MARKET_HEAD = 20      # select only the top MARKET_HEAD deals for the buyer: those deals most
                      # likely are the most representative and older records are ignored

In [7]:
currencies = set(set(currency.sell_currency.values) & 
                 set(currency.price_currency.values))

c_rates = pd.DataFrame(index=set(currency.price_currency.values), 
                       columns=set(currency.price_currency.values))

In [8]:
for v1 in currencies:
    for v2 in set(currencies - set(v1)):
        
        if v1 == 'chaos' or v2 == 'chaos':
            temp = currency[(currency.sell_currency==v1) & 
                            (currency.price_currency==v2)].copy()
            temp.sort_values('rate', ascending=False, inplace=True)

            rate_std = temp.rate.std()
            rate_mean = temp.rate.mean()
            temp['n_rate'] = temp.rate.apply(lambda y: 
                                             (y - rate_mean) / rate_std)

            f_rate = temp[(temp.n_rate > -OUTLIER_WINDOW) & 
                          (temp.n_rate < OUTLIER_WINDOW)].rate

            c_rates.loc[v1, v2] = round(np.mean(temp.head(MARKET_HEAD).rate), 3)
c_rates

Unnamed: 0,scour,silver,fuse,chrom,gcp,chisel,exa,regal,chance,vaal,chaos,alch,divine,blessed,jew,regret,alt
scour,,,,,,,,,,,3.04,,,,,,
silver,,,,,,,,,,,10.476,,,,,,
fuse,,,,,,,,,,,3.096,,,,,,
chrom,,,,,,,,,,,7.869,,,,,,
gcp,,,,,,,,,,,0.791,,,,,,
chisel,,,,,,,,,,,4.422,,,,,,
exa,,,,,,,,,,,0.009,,,,,,
regal,,,,,,,,,,,3.285,,,,,,
chance,,,,,,,,,,,12.404,,,,,,
vaal,,,,,,,,,,,2.711,,,,,,


In [10]:
c_rates.loc['chaos', :] * c_rates.loc[:, 'chaos']

scour       1.10352
silver     0.963792
fuse          1.161
chrom       1.04658
gcp        0.918351
chisel      1.03475
exa         1.03707
regal      0.890235
chance     0.843472
vaal       0.816011
chaos           NaN
alch        1.09174
divine      1.05624
blessed    0.440262
jew         1.00716
regret     0.948123
alt         3.58145
Name: chaos, dtype: object

# Features extraction

In [11]:
with PSQLSession('localhost', 'poe_price', 'fabio', 'password') as session:
    items = session.query('''
        SELECT * 
        FROM trade_item 
        WHERE category = 'weapons';''')

    items_modifiers = session.query('''
        SELECT tim.* 
        FROM trade_item as ti, trade_item_modifier as tim
        WHERE ti.id = tim.item_id
        AND ti.category = 'weapons';
        ''')
    
    items_properties = session.query('''
        SELECT tip.* 
        FROM trade_item as ti, trade_item_property as tip
        WHERE ti.id = tip.item_id
        AND ti.category = 'weapons';
        ''')
        
    items_sockets = session.query('''
        SELECT tis.* 
        FROM trade_item as ti, trade_item_socket as tis
        WHERE ti.id = tis.item_id
        AND ti.category = 'weapons';
        ''')

In [12]:
items_f = items.copy()

In [13]:
items_f.sample(10)

Unnamed: 0,id,category,corrupted,duplicated,identified,ilvl,influence_crusader,influence_elder,influence_hunter,influence_redeemer,influence_shaper,influence_warlord,league,num_prefixes,num_suffixes,num_veiled_modifiers,price_currency,price_quantity,rarity,requirement_dex,requirement_int,requirement_str,requirement_level,sub_category,synthesised,talisman_tier,date
48232,396789,weapons,,,True,71,,,,,,,Delirium,1.0,1.0,0.0,alch,1,magic,43.0,,151.0,58.0,twoaxe,,,03/29/2020
73744,606524,weapons,True,,True,50,,,,,,,Delirium,2.0,2.0,0.0,chaos,20,rare,26.0,,,64.0,bow,,,03/29/2020
66247,546010,weapons,,,False,67,,,,,,,Delirium,,,0.0,chaos,1,rare,170.0,,,,bow,,,03/29/2020
67671,558177,weapons,,,True,78,,True,,,,,Delirium,2.0,3.0,0.0,exa,1,rare,121.0,,,56.0,bow,,,03/29/2020
51750,426142,weapons,,,True,24,,,,,,,Delirium,1.0,3.0,0.0,alt,1,rare,,83.0,,24.0,wand,,,03/29/2020
57076,471754,weapons,,,True,83,,True,,,,,Delirium,3.0,3.0,0.0,chaos,20,rare,119.0,,119.0,70.0,onesword,,,03/29/2020
13069,113687,weapons,,,True,80,,,,,,,Delirium,3.0,2.0,0.0,chaos,5,rare,,188.0,,59.0,wand,,,03/29/2020
42852,352962,weapons,,,True,75,,,,,,,Delirium,3.0,2.0,0.0,chaos,4,rare,,188.0,,59.0,wand,,,03/29/2020
76512,629459,weapons,,,True,82,,,,,,,Delirium,2.0,3.0,0.0,chaos,15,rare,,237.0,,70.0,wand,,,03/29/2020
45008,370606,weapons,,,True,80,,,,,,,Delirium,2.0,2.0,0.0,chaos,5,rare,,212.0,,62.0,wand,,,03/29/2020


In [14]:
for col_name in ['corrupted', 'duplicated', 'influence_crusader', 'influence_elder',
         'influence_hunter', 'influence_redeemer', 'influence_shaper', 
         'influence_warlord']:
    items_f[col_name].fillna(False, inplace=True)

for col_name in ['num_veiled_modifiers', 'requirement_dex', 'requirement_int', 'requirement_str', 'requirement_level']:
    items_f[col_name].fillna(0, inplace=True)

In [15]:
def map_price(y):
    return c_rates.loc['chaos', currency] * quantity

price = []
for k, v in items_f.iterrows():
    if v.price_currency != 'chaos':
        price.append(c_rates.loc['chaos', v.price_currency] * v.price_quantity)
    else:
        price.append(v.price_quantity)
items_f['price'] = price

In [16]:
%%time
how = 'value'
        
mod_lut = {pid: i for i, pid in enumerate(sorted(set(items_modifiers.modifier_id.values)))}
mod_fm = lil_matrix((items_f.shape[0], len(mod_lut)))
for i, wid in enumerate(items_f.id.values):
    for pid, mod in items_modifiers[items_modifiers.item_id == wid].iterrows():
        values = [v for v in [mod.value0, mod.value1, mod.value2] if ~np.isnan(v)]
        if len(values) > 0 and how == 'value':
            values = np.mean(values)
        elif len(values) == 0 or how == 'flag':
            values = 1
        mod_fm[i, mod_lut[mod.modifier_id]] = values
        
items_f = pd.concat([items_f, pd.DataFrame(mod_fm.todense(), columns=['mod{}'.format(i) for i in range(len(mod_lut))])], axis=1)

CPU times: user 3min 40s, sys: 554 ms, total: 3min 41s
Wall time: 3min 41s


In [17]:
%%time
how = 'value'

prop_lut = {pid: i for i, pid in enumerate(sorted(set(items_properties.property_id.values)))}
prop_fm = lil_matrix((items_f.shape[0], len(prop_lut)))
for i, wid in enumerate(items_f.id.values):
    for pid, prop in items_properties[items_properties.item_id == wid].iterrows():
        values = [v for v in [prop.value0, prop.value1] if ~np.isnan(v)]
        if len(values) > 0 and how == 'value':
            values = np.mean(values)
        elif len(values) == 0 or how == 'flag':
            values = 1
        prop_fm[i, prop_lut[prop.property_id]] = values
        
items_f = pd.concat([items_f, pd.DataFrame(prop_fm.todense(), columns=['prop{}'.format(i) for i in range(len(prop_lut))])], axis=1)

CPU times: user 2min 31s, sys: 236 ms, total: 2min 31s
Wall time: 2min 31s


In [18]:
%%time 
items_f['n_sockets'] = 0
items_f['n_red_sockets'] = 0
items_f['n_green_sockets'] = 0
items_f['n_blue_sockets'] = 0
items_f['n_white_sockets'] = 0
items_f['n_abyss_sockets'] = 0
items_f['n_links'] = 0

sockets = None
for iid in items_f.id.values:
    sockets = items_sockets[items_sockets.item_id == iid]
    scolors = sockets.colour.value_counts()
    items_f.at[k, 'n_sockets'] = sockets.shape[0]
    items_f.at[k, 'n_red_sockets'] = scolors['R'] if 'R' in scolors else 0
    items_f.at[k, 'n_green_sockets'] = scolors['G'] if 'G' in scolors else 0
    items_f.at[k, 'n_blue_sockets'] = scolors['B'] if 'B' in scolors else 0
    items_f.at[k, 'n_white_sockets'] = scolors['W'] if 'W' in scolors else 0
    items_f.at[k, 'n_abyss_sockets'] = scolors['A'] if 'A' in scolors else 0
    items_f.at[k, 'n_links'] = sockets.socket_group.value_counts().iloc[0]

CPU times: user 3min 43s, sys: 128 ms, total: 3min 43s
Wall time: 3min 43s


In [19]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

rarity_le = LabelEncoder()
rarity_le.fit(['normal', 'magic', 'rare'])
items_f.rarity = rarity_le.transform(items_f.rarity)

tf_le = LabelEncoder()
tf_le.fit([True, False])
for col_name in ['corrupted', 'duplicated', 'identified', 'influence_crusader', 'influence_elder',
         'influence_hunter', 'influence_redeemer', 'influence_shaper', 
         'influence_warlord']:
    items_f[col_name] = tf_le.transform(items_f[col_name])

In [20]:
subcategory_ohe = OneHotEncoder(sparse=False)

items_f = pd.concat([items_f, pd.DataFrame(subcategory_ohe.fit_transform(items_f.sub_category.values.reshape(-1,1)), 
             columns=['category{}'.format(i) 
                      for i in range(len(subcategory_ohe.get_feature_names()))])], axis=1)

In [21]:
for col in items_f.columns:
    if len(items_f.loc[:, col].value_counts(dropna=False)) == 1:
        del(items_f[col])
        
items_f.drop(columns=['id', 'category', 'date', 'price_currency', 'price_quantity', 'num_prefixes', 'num_suffixes', 'sub_category'], inplace=True, errors='ignore')

In [22]:
items_f_copy = items_f.copy(deep=True)

In [23]:
items_f = items_f_copy.copy(deep=True)
items_f = items_f[(items_f.price<=1000) & (items_f.price>=1)]

In [24]:
y = items_f['price'].values
X = items_f.loc[:, items_f.columns != 'price'].values

# ML models

In [25]:
from sklearn.metrics import r2_score, accuracy_score, mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm

def print_metrics(y_train, y_train_pred, y_test, y_test_pred):
    print('[R2]\tTrain: {}\tTest: {}'.format(r2_score(y_train, y_train_pred),
                                   r2_score(y_test, y_test_pred)))
    print('[MAE]\tTrain: {}\tTest: {}'.format(mean_absolute_error(y_train, y_train_pred),
                                       mean_absolute_error(y_test, y_test_pred)))

In [26]:
sc_X = StandardScaler()
sc_y = StandardScaler()

X = sc_X.fit_transform(X)
y = sc_y.fit_transform(y.reshape(-1,1))

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## Linear regression

In [29]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

y_train_pred = regressor.predict(X_train)
y_test_pred = regressor.predict(X_test)

print_metrics(y_train, y_train_pred, y_test, y_test_pred)

[R2]	Train: 0.2929533994836647	Test: -3.7843605865384713e+24
[MAE]	Train: 0.4341323292041097	Test: 15431058863.79042


## backward elimination (r2 score + p value)

In [22]:
regressor_OLS = sm.OLS(y, X).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.295
Model:,OLS,Adj. R-squared (uncentered):,0.291
Method:,Least Squares,F-statistic:,71.62
Date:,"Mon, 04 May 2020",Prob (F-statistic):,0.0
Time:,19:53:27,Log-Likelihood:,-98143.0
No. Observations:,78898,AIC:,197200.0
Df Residuals:,78439,BIC:,201500.0
Df Model:,459,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,0.0250,0.003,7.383,0.000,0.018,0.032
x2,0.0010,0.003,0.346,0.730,-0.005,0.007
x3,-0.0018,0.005,-0.341,0.733,-0.012,0.008
x4,0.0891,0.004,20.984,0.000,0.081,0.097
x5,0.0280,0.005,5.456,0.000,0.018,0.038
x6,0.0472,0.006,8.120,0.000,0.036,0.059
x7,0.0164,0.006,2.870,0.004,0.005,0.028
x8,0.0190,0.005,3.554,0.000,0.009,0.029
x9,0.0356,0.006,6.170,0.000,0.024,0.047

0,1,2,3
Omnibus:,71435.775,Durbin-Watson:,1.752
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3122971.05
Skew:,4.327,Prob(JB):,0.0
Kurtosis:,32.582,Cond. No.,5.6e+16


In [26]:
def backwardElimination(x, sl):
    stats = []
    
    numVars = len(x[0])
    for i in range(0, numVars):
        a = time.time()
        
        regressor_OLS = sm.OLS(y, x).fit()
        maxVar = max(regressor_OLS.pvalues).astype(float)
        if maxVar > sl:
            for j in range(0, numVars - i):
                if (regressor_OLS.pvalues[j].astype(float) == maxVar):
                    x = np.delete(x, j, 1)
                    
        b = time.time()            
        print('[{}/{} ({}%) computed in {} seconds]: R2={}, R2_norm={}, Highets P_value:{}'.format(i, numVars, 
            round(i/numVars, 3), int(b-a), round(regressor_OLS.rsquared, 5), round(regressor_OLS.rsquared_adj, 5), 
            round(maxVar, 5)))
        stats.append((regressor_OLS.rsquared, regressor_OLS.rsquared_adj, maxVar))
        
    regressor_OLS.summary()
    return x, stats

SL = 0.05
X_clean, stats = backwardElimination(X, SL)

[0/466 (0.0%) computed in 8 seconds]: R2=0.29533, R2_norm=0.29121, Highets P_value:0.99967
[1/466 (0.002%) computed in 10 seconds]: R2=0.29533, R2_norm=0.29122, Highets P_value:0.97237
[2/466 (0.004%) computed in 18 seconds]: R2=0.29533, R2_norm=0.29122, Highets P_value:0.97177
[3/466 (0.006%) computed in 15 seconds]: R2=0.29533, R2_norm=0.29123, Highets P_value:0.97052
[4/466 (0.009%) computed in 10 seconds]: R2=0.29533, R2_norm=0.29124, Highets P_value:0.96843
[5/466 (0.011%) computed in 11 seconds]: R2=0.29533, R2_norm=0.29125, Highets P_value:0.96519
[6/466 (0.013%) computed in 12 seconds]: R2=0.29533, R2_norm=0.29125, Highets P_value:0.96149
[7/466 (0.015%) computed in 12 seconds]: R2=0.29533, R2_norm=0.29126, Highets P_value:0.95428
[8/466 (0.017%) computed in 10 seconds]: R2=0.29533, R2_norm=0.29127, Highets P_value:0.95247
[9/466 (0.019%) computed in 12 seconds]: R2=0.29533, R2_norm=0.29128, Highets P_value:0.94485
[10/466 (0.021%) computed in 10 seconds]: R2=0.29533, R2_norm=0

## RandomForest with 10 trees

In [30]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)
regressor.fit(X_train,  y_train.ravel())

y_train_pred = regressor.predict(X_train)
y_test_pred = regressor.predict(X_test)

print_metrics(y_train, y_train_pred, y_test, y_test_pred)

[R2]	Train: 0.8759291161516278	Test: 0.3033642970240694
[MAE]	Train: 0.14629374327204797	Test: 0.37251095701000647


### with KFold train test

In [29]:
kf = KFold(n_splits=10, shuffle=True, random_state=0)
regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)

metrics = []
for train_index, test_index in kf.split(X):
    print('.', end='')
    
    X_train, X_test = X_clean[train_index], X_clean[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    regressor.fit(X_train, y_train.ravel())

    y_train_pred = regressor.predict(X_train)
    y_test_pred = regressor.predict(X_test)
    
    metrics.append((r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)))

print(np.mean([v1 for v1, v2 in metrics]), np.std([v1 for v1, v2 in metrics]))
print(np.mean([v2 for v1, v2 in metrics]), np.std([v2 for v1, v2 in metrics]))

..........0.8798749637390257 0.0019064147262380323
0.3258246892409187 0.03351033651596367


### backward elimination (R2 score + p value)

In [None]:
import statsmodels.formula.api as sm
def backwardElimination(x, sl):
    numVars = len(x[0])
    for i in range(0, numVars):
        regressor_OLS = sm.OLS(y, x).fit()
        maxVar = max(regressor_OLS.pvalues).astype(float)
        if maxVar > sl:
            for j in range(0, numVars - i):
                if (regressor_OLS.pvalues[j].astype(float) == maxVar):
                    x = np.delete(x, j, 1)
    regressor_OLS.summary()
    return x

SL = 0.05
X_opt = X[:, [0, 1, 2, 3, 4, 5]]
X_Modeled = backwardElimination(X_opt, SL)

### RandomForest with 100 trees

In [79]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators = 100, random_state = 0)
regressor.fit(X_train, y_train)

y_train_pred = regressor.predict(X_train)
y_test_pred = regressor.predict(X_test)
print_metrics(y_train, y_train_pred, y_test, y_test_pred)

  This is separate from the ipykernel package so we can avoid doing imports until


0.2763365525206958

## SVR

In [None]:
from sklearn.svm import SVR
regressor = SVR(kernel = 'rbf')
regressor.fit(X_train, y_train.ravel())

y_train_pred = regressor.predict(X_train)
y_test_pred = regressor.predict(X_test)
print_metrics(y_train, y_train_pred, y_test, y_test_pred)

In [None]:
from sklearn.svm import SVR
regressor = SVR(kernel = 'poly', degree=2)
regressor.fit(X_train, y_train.ravel())

y_train_pred = regressor.predict(X_train)
y_test_pred = regressor.predict(X_test)
print_metrics(y_train, y_train_pred, y_test, y_test_pred)

  y = column_or_1d(y, warn=True)
