In [10]:
import sys
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse import hstack

import configparser

sys.path.append('/home/fabio/Desktop/poe_price')
from poe_price.learn import feature, preprocess
from poe_price.data import select

pd.set_option('display.max_columns', None)

In [2]:
conn_config = configparser.ConfigParser()
conn_config.read('/home/fabio/Desktop/poe_price/data/database/connect.ini')

['/home/fabio/Desktop/poe_price/data/database/connect.ini']

In [14]:
items = select.get_armours(conn_config['postgresql'], currency_types=['chaos', 'exalted'])

currency = select.get_currency(conn_config['postgresql'])

# Feature extraction

In [11]:
X = []

In [12]:
X.append(feature.corrupted(items))
# X.append(feature.duplicated(items))
# X.append(feature.identified(items))
# X.append(feature.requirements(items))
# X.append(feature.veiled(items, how='flag'))
# X.append(feature.influences(items))
X.append(feature.sockets(items))
# X.append(feature.is_abyss_jewel(items))

mods = feature.Modifiers(how='mean')
X.append(mods.fit_transform(items))

props = feature.Properties(how='mean')
X.append(props.fit_transform(items))

# categ = feature.Category()
# X.append(categ.fit_transform(items))

# subcateg = feature.SubCategories()
# X.append(subcateg.fit_transform(items))

rarity = feature.Rarity()
X.append(rarity.fit_transform(items))

KeyboardInterrupt: 

In [7]:
len(X), [v.shape for v in X]

(6,
 [(66144, 1), (66144, 1), (66144, 500), (66144, 7), (66144, 14), (66144, 2)])

In [8]:
X = hstack(X)
X.shape, items['trade_item'].shape

((66144, 525), (66144, 28))

In [9]:
price_transformer = feature.Price()

y = price_transformer.fit_transform(items, currency)

# Preprocessing

In [16]:
# coo_matrix -> csr_matrix
X = preprocess.remove_empty_features(X.tocsr())
X.shape

(65279, 514)

In [17]:
X = X.toarray()

In [18]:
X, y = preprocess.price_bounds_row_removal(X, y, 0, 1000)
X.shape

(65279, 514)

In [19]:
from sklearn.preprocessing import MaxAbsScaler

sc_X = MaxAbsScaler()
sc_y = MaxAbsScaler()

X_scaled = sc_X.fit_transform(X)
y_scaled = sc_y.fit_transform(y)

In [21]:
from sklearn.decomposition import TruncatedSVD, PCA, KernelPCA

# tsvd = TruncatedSVD(10, n_iter=10, random_state=42)
# X_reduced = tsvd.fit_transform(X_scaled)

# pca = PCA(10, random_state=42)
# X_reduced = pca.fit_transform(X_scaled)

kpca = KernelPCA(10, kernel='poly', degree=3, random_state=42, n_jobs=2)
X_reduced = kpca.fit_transform(X_scaled)

### backward elimination (r2 score + p value)

In [23]:
import statsmodels.api as sm
regressor_OLS = sm.OLS(y_scaled, X_reduced).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.043
Model:,OLS,Adj. R-squared (uncentered):,0.042
Method:,Least Squares,F-statistic:,290.4
Date:,"Mon, 03 Aug 2020",Prob (F-statistic):,0.0
Time:,18:32:18,Log-Likelihood:,56006.0
No. Observations:,65279,AIC:,-112000.0
Df Residuals:,65269,BIC:,-111900.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,0.0090,0.000,19.792,0.000,0.008,0.010
x2,-0.0088,0.001,-13.134,0.000,-0.010,-0.007
x3,0.0079,0.001,10.220,0.000,0.006,0.009
x4,-0.0206,0.001,-21.572,0.000,-0.022,-0.019
x5,0.0329,0.001,31.998,0.000,0.031,0.035
x6,0.0147,0.001,13.963,0.000,0.013,0.017
x7,0.0179,0.001,16.817,0.000,0.016,0.020
x8,0.0037,0.001,3.473,0.001,0.002,0.006
x9,-0.0091,0.001,-8.329,0.000,-0.011,-0.007

0,1,2,3
Omnibus:,62768.312,Durbin-Watson:,1.574
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2582348.048
Skew:,4.819,Prob(JB):,0.0
Kurtosis:,32.267,Cond. No.,2.4


In [24]:
def backwardElimination(x, sl):
    stats = []
    
    numVars = len(x[0])
    for i in range(0, numVars):
        a = time.time()
        
        regressor_OLS = sm.OLS(y, x).fit()
        maxVar = max(regressor_OLS.pvalues).astype(float)
        if maxVar > sl:
            for j in range(0, numVars - i):
                if (regressor_OLS.pvalues[j].astype(float) == maxVar):
                    x = np.delete(x, j, 1)
                    
        b = time.time()            
        print('[{}/{} ({}%) computed in {} seconds]: R2={}, R2_norm={}, Highets P_value:{}'.format(i, numVars, 
            round(i/numVars, 3), int(b-a), round(regressor_OLS.rsquared, 5), round(regressor_OLS.rsquared_adj, 5), 
            round(maxVar, 5)))
        stats.append((regressor_OLS.rsquared, regressor_OLS.rsquared_adj, maxVar))
        
    regressor_OLS.summary()
    return x, stats

SL = 0.05
X_clean, stats = backwardElimination(X_reduced, SL)
X_clean.shape

[0/10 (0.0%) computed in 0 seconds]: R2=0.0426, R2_norm=0.04245, Highets P_value:0.00052
[1/10 (0.1%) computed in 0 seconds]: R2=0.0426, R2_norm=0.04245, Highets P_value:0.00052
[2/10 (0.2%) computed in 0 seconds]: R2=0.0426, R2_norm=0.04245, Highets P_value:0.00052
[3/10 (0.3%) computed in 0 seconds]: R2=0.0426, R2_norm=0.04245, Highets P_value:0.00052
[4/10 (0.4%) computed in 0 seconds]: R2=0.0426, R2_norm=0.04245, Highets P_value:0.00052
[5/10 (0.5%) computed in 0 seconds]: R2=0.0426, R2_norm=0.04245, Highets P_value:0.00052
[6/10 (0.6%) computed in 0 seconds]: R2=0.0426, R2_norm=0.04245, Highets P_value:0.00052
[7/10 (0.7%) computed in 0 seconds]: R2=0.0426, R2_norm=0.04245, Highets P_value:0.00052
[8/10 (0.8%) computed in 0 seconds]: R2=0.0426, R2_norm=0.04245, Highets P_value:0.00052
[9/10 (0.9%) computed in 0 seconds]: R2=0.0426, R2_norm=0.04245, Highets P_value:0.00052


(65279, 10)

In [25]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_clean, y_scaled, test_size = 0.1, random_state = 0)

In [26]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((58751, 10), (6528, 10), (58751, 1), (6528, 1))

# ML models

In [27]:
from sklearn.metrics import r2_score, accuracy_score, mean_absolute_error
from sklearn.model_selection import cross_val_score, KFold

def print_metrics(y_train, y_train_pred, y_test, y_test_pred):
    print('[R2]\tTrain: {}\tTest: {}'.format(r2_score(y_train, y_train_pred),
                                   r2_score(y_test, y_test_pred)))
    print('[MAE]\tTrain: {}\tTest: {}'.format(mean_absolute_error(y_train, y_train_pred),
                                       mean_absolute_error(y_test, y_test_pred)))

## Linear regression

In [28]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

y_train_pred = regressor.predict(X_train)
y_test_pred = regressor.predict(X_test)

print_metrics(y_train, y_train_pred, y_test, y_test_pred)

[R2]	Train: 0.050974980266399794	Test: 0.038847520060119045
[MAE]	Train: 0.04723142594762414	Test: 0.04593647070547255


## RandomForest with 10 trees

In [29]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)
regressor.fit(X_train,  y_train.ravel())

y_train_pred = regressor.predict(X_train)
y_test_pred = regressor.predict(X_test)

print_metrics(y_train, y_train_pred, y_test, y_test_pred)

[R2]	Train: 0.8004794818290281	Test: -0.06560474575945618
[MAE]	Train: 0.01994027602274763	Test: 0.04912600336032342


### RandomForest with 100 trees

In [38]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators = 100, random_state = 0)
regressor.fit(X_train, y_train)

y_train_pred = regressor.predict(X_train)
y_test_pred = regressor.predict(X_test)
print_metrics(y_train, y_train_pred, y_test, y_test_pred)

  This is separate from the ipykernel package so we can avoid doing imports until


KeyboardInterrupt: 

## SVR

In [None]:
from sklearn.svm import SVR
regressor = SVR(kernel = 'rbf')
regressor.fit(X_train, y_train.ravel())

y_train_pred = regressor.predict(X_train)
y_test_pred = regressor.predict(X_test)
print_metrics(y_train, y_train_pred, y_test, y_test_pred)

In [None]:
from sklearn.svm import SVR
regressor = SVR(kernel = 'poly', degree=2)
regressor.fit(X_train, y_train.ravel())

y_train_pred = regressor.predict(X_train)
y_test_pred = regressor.predict(X_test)
print_metrics(y_train, y_train_pred, y_test, y_test_pred)

## Polinomial regression

In [30]:
from sklearn.preprocessing import PolynomialFeatures

pf = PolynomialFeatures(degree=4)
X_train_poly = pf.fit_transform(X_train)
X_test_poly = pf.transform(X_test)

In [31]:
X_train_poly.shape

(58751, 1001)

In [32]:
regressor = LinearRegression()
regressor.fit(X_train_poly, y_train)

y_train_pred = regressor.predict(X_train_poly)
y_test_pred = regressor.predict(X_test_poly)

print_metrics(y_train, y_train_pred, y_test, y_test_pred)

[R2]	Train: 0.12920403878340747	Test: 0.08419958014603568
[MAE]	Train: 0.04532859504665678	Test: 0.04497543963732655
