The goal of this notebook is to understand the original data structure and define basic operations.

The source data explored here has been previously downloaded from Path of Exile stash tabs API to local .txt files. Each file contained in a specific folder represent a single API call to http://www.pathofexile.com/api/public-stash-tabs?id= with a proper **nextChangeId**: this id is iteratively applyed on the next API call.

# TODO

In [1]:
import os
import re
import json
import random

import pandas as pd

In [2]:
pd.set_option('display.max_columns', 500)

DATA_PATH = '../../data/temp/stashes_delirium/'

LEAGUE = 'Metamorph'

CURRENCY_KEY = ['alt', 'fuse', 'alch', 'chaos', 'gcp', 'exa', 'chrom', 'jew', 'chance', 'chisel', 'scour', 'blessed', 'regret', 'regal', 'divine', 'vaal', 'silver']
CURRENCY_NAME = ["Orb of Alteration", "Orb of Fusing", "Orb of Alchemy", "Chaos Orb", "Gemcutter's Prism", "Exalted Orb", "Chromatic Orb", "Jeweller's Orb", "Orb of Chance", "Cartographer's Chisel", "Orb of Scouring", "Blessed Orb", "Orb of Regret", "Regal Orb", "Divine Orb", "Vaal Orb", "Silver Coin"]

ITEMS_CATEGORIES = ['accessories', 'armour', 'jewels', 'weapons', 'currency']

In [3]:
def get_stashes_dict(path):
    with open(path, 'rb') as file:
        try:
            return json.load(file)['stashes']
        except:
            print('ERROR: ', path)
            return []
        
        
def get_stashes_items(verbose=False):
    values = list()
    
    # select a fixed number of files
    for v in random.sample(os.listdir(DATA_PATH), 50):
#     for v in os.listdir(DATA_PATH):
        for stash in get_stashes_dict(DATA_PATH + v):
            if len(stash['items']) > 0 and stash['public']:
                # add stash related data to single items
                for item in stash['items']:
                    item['stash_note'] = stash['stash']
                    item['league'] = stash['league']
                values.extend(stash['items'])
    return values

def get_string_price(string):
    if re.match('(~price|~b/o)\s\d+((\.|/)\d+)?\s\w+', string) is not None:
        tokens = string.split(' ')
        
        currency, quantity = tokens[2], int(tokens[1])
        
        # check if currency and quantity are allowed values
        if currency in CURRENCY_KEY:
            return currency, quantity
    raise ValueError()
    
def delete_df_columns(df, *col_name, verbose=False):
    missing = list()
    for v in col_name:
        try:
            del(df[v])
        except:
            missing.append(v)

    if len(missing) > 0 and verbose:
        print('Missing columns: {}'.format(missing))

# Data exploration

In [4]:
items = pd.DataFrame(get_stashes_items())

# alphabetically order dataframe columns
items = items[sorted(items.columns)]

items.shape

(254858, 54)

In [5]:
# extract items category and subcategory
items['subCategory'] = items.extended.apply(lambda y: ' '.join(y['subcategories']) if 'subcategories' in y else None)
items['category'] = items.extended.apply(lambda y: y['category'])

# drop unwanted categories
items = items[items.category.isin(ITEMS_CATEGORIES)]

In [6]:
# extract items price and drop unpriced items
price = []
for k, v in items.iterrows():
    try:
        # item has a valid price
        item_price = get_string_price(v.note)
    except:
        try:
            # stash has a valid price
            item_price = get_string_price(v.stash_note)
        except:
            # both item and stash have no valid price
            item_price = (None, None)
    price.append(item_price)
(items['priceCurrency'], items['priceQuantity']) = zip(*price)
    
# drop unpriced items
items.drop(index = items[((items.priceCurrency.isna()) & (items.priceQuantity.isna())) | (items.priceQuantity == 0)].index, inplace=True)

In [47]:
target = 'extended'

print(len(items[items[target].notna()]))

display(items[items[target].notna()][target].sample(10))
print(items[items[target].notna()][target].value_counts())

print(items[items[target].notna()].category.value_counts())

print(items[items[target].notna()].influences.value_counts())

108029


154732    {'category': 'accessories', 'subcategories': [...
177201    {'category': 'jewels', 'baseType': 'Cobalt Jew...
39306     {'category': 'accessories', 'subcategories': [...
115083    {'category': 'armour', 'subcategories': ['boot...
186165    {'category': 'armour', 'subcategories': ['shie...
128166    {'category': 'jewels', 'subcategories': ['clus...
14131     {'category': 'jewels', 'subcategories': ['clus...
187981    {'category': 'armour', 'subcategories': ['boot...
167093    {'category': 'jewels', 'prefixes': 1, 'suffixe...
179941    {'category': 'accessories', 'subcategories': [...
Name: extended, dtype: object

{'category': 'jewels', 'prefixes': 2, 'suffixes': 2, 'baseType': 'Crimson Jewel'}                                        1481
{'category': 'jewels', 'subcategories': ['cluster'], 'prefixes': 2, 'suffixes': 2, 'baseType': 'Large Cluster Jewel'}    1470
{'category': 'jewels', 'prefixes': 2, 'suffixes': 2, 'baseType': 'Cobalt Jewel'}                                         1455
{'category': 'jewels', 'prefixes': 2, 'suffixes': 2, 'baseType': 'Viridian Jewel'}                                       1407
{'category': 'jewels', 'subcategories': ['cluster'], 'prefixes': 2, 'suffixes': 1, 'baseType': 'Large Cluster Jewel'}    1276
                                                                                                                         ... 
{'category': 'accessories', 'subcategories': ['belt'], 'prefixes': 1, 'suffixes': 1, 'baseType': 'Heavy Belt'}              1
{'category': 'armour', 'subcategories': ['shield'], 'prefixes': 3, 'suffixes': 3, 'baseType': 'Crimson Round Shield'} 

In [9]:
# name is needed to distinguish between unique and other items
items.drop(columns=['abyssJewel', 'additionalProperties', 'artFilename', 'cisRaceReward', 'cosmeticMods', 'delve', 'descrText', 'elder', 'flavourText', 'h', 'hybrid', 'icon', 
                  'id', 'inventoryId', 'maxStackSize', 'name', 'note', 'prophecyText', 'seaRaceReward', 'secDescrText', 'shaper', 
                    'socketedItems', 'stash_note', 'verified',
                  'support', 'thRaceReward', 'utilityMods', 'w', 'x', 'y'], inplace=True, errors='ignore')

In [16]:
items.columns

Index(['abyssJewel', 'additionalProperties', 'artFilename', 'corrupted',
       'craftedMods', 'delve', 'descrText', 'duplicated', 'elder',
       'enchantMods', 'explicitMods', 'extended', 'flavourText', 'frameType',
       'h', 'hybrid', 'icon', 'id', 'identified', 'ilvl', 'implicitMods',
       'incubatedItem', 'influences', 'inventoryId', 'itemLevel', 'league',
       'maxStackSize', 'name', 'nextLevelRequirements', 'note', 'properties',
       'prophecyText', 'requirements', 'secDescrText', 'shaper',
       'socketedItems', 'sockets', 'stackSize', 'stash_note', 'support',
       'synthesised', 'talismanTier', 'typeLine', 'utilityMods', 'veiled',
       'veiledMods', 'verified', 'w', 'x', 'y', 'subCategory', 'category',
       'priceCurrency', 'priceQuantity'],
      dtype='object')

## Currencies

In [11]:
currencies = items[items.category=='currency']

In [12]:
currencies = currencies[currencies.typeLine.isin(CURRENCY_NAME)]

currencies['sellCurrency'] = currencies.typeLine.apply(lambda y: CURRENCY_KEY[CURRENCY_NAME.index(y)])

In [13]:
currencies.rename(columns={"stackSize": "sellQuantity"}, inplace=True)

In [14]:
sellerCRate = []
buyerCRate = []
for k, v in currencies.iterrows():
    buyerCRate.append(int(v.sellQuantity) / int(v.priceQuantity))
    sellerCRate.append(int(v.priceQuantity) / int(v.sellQuantity))
    
currencies['sellRate'] = sellerCRate
currencies['buyRate'] = buyerCRate

In [15]:
# name is needed to distinguish between unique and other items
currencies.drop(columns=['additionalProperties', 'corrupted', 'craftedMods', 'delve',
       'duplicated', 'enchantMods', 'explicitMods', 'extended', 'fractured',
       'fracturedMods', 'frameType', 'identified', 'ilvl', 'implicitMods', 'incubatedItem',
       'influences', 'itemLevel', 'name',
       'nextLevelRequirements', 'properties', 'requirements',
       'sockets', 'stackSize', 'stash_note', 'synthesised', 'talismanTier',
       'typeLine', 'veiled', 'veiledMods', 'subCategory', 'category'], inplace=True, errors='ignore')

In [16]:
currencies.sample()

Unnamed: 0,league,sellQuantity,priceCurrency,priceQuantity,sellCurrency,sellerRate,buyerRate
114,Delirium,20.0,chaos,7,jew,0.35,2.857143


## Mod based items

In [15]:
def items_mods_formatting(items, target_mod):
    mitems_mods_vocabulary = []
    mitems_mods = []
    for k, v in items.iterrows():
        if isinstance(v[target_mod], list):
            for mod in v[target_mod]:
                item_mod = dict()
                item_mod['itemId'] = k
                generic_mod = re.sub('\d+', '#', mod)
                try:
                    mod_index = mitems_mods_vocabulary.index(generic_mod)
                except:
                    mitems_mods_vocabulary.append(generic_mod)
                    mod_index = len(mitems_mods_vocabulary) - 1
                item_mod['modId'] = mod_index
                for i, v in enumerate(re.findall('\d+', mod)):
                    item_mod['value{}'.format(i)] = v
                mitems_mods.append(item_mod)
    mitems_mods = pd.DataFrame(mitems_mods)
    mitems_mods_vocabulary = pd.DataFrame(mitems_mods_vocabulary, columns=['text'])
    return mitems_mods, mitems_mods_vocabulary

In [16]:
mitems = items[items.category.isin(['accessories', 'armour', 'jewels', 'weapons'])].copy()

In [17]:
mitems['rarity'] = mitems.frameType.apply(lambda y: 'normal' if y == 0 else 'magic' if y == 1 else 'rare' if y == 2 else 'unique' if y == 3 else 'relic')
mitems = mitems[~mitems.rarity.isin(['relic', 'unique'])]

In [18]:
# pandas built-in filling functionality
mitems.corrupted.fillna(False, inplace=True)
mitems['duplicated'].fillna(False, inplace=True)
mitems.fractured.fillna(False, inplace=True)
mitems.synthesised.fillna(False, inplace=True)
# items.note.fillna('', inplace=True)

In [None]:
mitems['nPrefixes'] = mitems.extended.apply(lambda y: y['prefixes'] if 'prefixes' in y else 0)
mitems['nSuffixes'] = mitems.extended.apply(lambda y: y['suffixes'] if 'suffixes' in y else 0)
mitems['nExplicitMods'] = mitems.nPrefixes + mitems.nSuffixes

mitems['nImplicitMods'] = mitems.implicitMods.apply(lambda y: len(y) if isinstance(y, list) else 0)


In [20]:
mitems_imp_mods, voc_imp_mods = items_mods_formatting(mitems, 'implicitMods')
mitems_exp_mods, voc_exp_mods = items_mods_formatting(mitems, 'explicitMods')
mitems_enc_mods, voc_enc_mods = items_mods_formatting(mitems, 'enchantMods')
mitems_cra_mods, voc_cra_mods = items_mods_formatting(mitems, 'craftedMods')
mitems_fra_mods, voc_fra_mods = items_mods_formatting(mitems, 'fracturedMods')

voc_imp_mods.shape, voc_exp_mods.shape, voc_enc_mods.shape, voc_cra_mods.shape, voc_fra_mods.shape

((264, 1), (780, 1), (810, 1), (199, 1), (63, 1))

In [21]:
mitems.drop(columns=['typeLine', 'extended', 'frameType', 'implicitMods', 'explicitMods', 'enchantMods', 'craftedMods', 'fracturedMods'],
            inplace=True, errors='ignore')

### Jewels

Defined mods: explicitMods, fracturedMods, implicitMods

In [22]:
jewels = mitems[mitems.category == 'jewels'].copy()
jewels.shape

(32330, 28)

In [23]:
jewels['lvlRequirement'] = jewels['requirements'].apply(lambda y: y[0]['values'][0][0] if pd.notna(y) else 0)

In [24]:
jewels['isAbyss'] = jewels['subCategory'].apply(lambda y: True if y == 'abyss' else False)

In [25]:
jewels.drop(columns=['additionalProperties', 'category', 'delve', 'incubatedItem', 'influences', 
                     'itemLevel', 'nextLevelRequirements', 'properties', 'requirements', 'sockets',
                     'stackSize', 'subCategory', 'talismanTier', 'veiled', 'veiledMods'], 
            errors='ignore', inplace=True)

In [26]:
jewels.sample()

Unnamed: 0,corrupted,duplicated,fractured,identified,ilvl,league,synthesised,priceCurrency,priceQuantity,rarity,nPrefixes,nSuffixes,nAffixes,lvlRequirement,isAbyss
29512,False,False,False,True,75,Standard,False,chaos,10,rare,1,2,3,0,False


### Wearable

In [27]:
def items_prop_formatting(items, target_prop):
    mitems_props_vocabulary = []
    mitems_props = []
    
    t1 = []
    for k, v in items.iterrows():
        if isinstance(v[target_prop], list):
            for prop in [p for p in v[target_prop] if len(p['values']) > 0]:
                item_prop = dict()
                item_prop['itemId'] = k
                generic_prop = prop['name']
                
                try:
                    prop_index = mitems_props_vocabulary.index(generic_prop)
                except:
                    mitems_props_vocabulary.append(generic_prop)
                    prop_index = len(mitems_props_vocabulary) - 1
                item_prop['propId'] = prop_index
                for i, v in enumerate(prop['values'][0][0].split('-')):
                    item_prop['value{}'.format(i)] = float(re.sub('(\+|-|%)', '', v))
                mitems_props.append(item_prop)
    mitems_props = pd.DataFrame(mitems_props)
    mitems_props_vocabulary = pd.DataFrame(mitems_props_vocabulary, columns=['text'])
    return mitems_props, mitems_props_vocabulary

In [28]:
wearable = mitems[mitems.category.isin(['armours', 'weapons', 'accessories'])].copy()
wearable.shape

(43583, 28)

In [29]:
# split veiledMods into a much intuitive data structure
wearable['veiledMods'] = wearable.veiledMods.apply(lambda y: sum(['Prefix' in v or 'Suffix' in v for v in y]) if isinstance(y, list) else 0)

In [30]:
wearable_prop, voc_wearable_prop = items_prop_formatting(wearable, 'properties')

In [31]:
# extract items requirements
itemLvlRequirement, itemDexRequirement, itemStrRequirement, itemIntRequirement = [], [], [], []

for k, v in wearable.iterrows():
    iLvlReq, iStrReq, iDexReq, iIntReq = 0, 0, 0, 0
    if isinstance(v['requirements'], list):
        for ireq in v['requirements']:
            req_value = int(ireq['values'][0][0])

            if ireq['name'] == 'Level':
                iLvlReq = req_value
            elif ireq['name'] in ['Str', 'Strength']:
                iStrReq = req_value
            elif ireq['name'] in ['Dex', 'Dexterity']:
                iDexReq = req_value
            elif ireq['name'] in ['Int', 'Intelligence']:
                iIntReq = req_value

    itemLvlRequirement.append(iLvlReq)
    itemDexRequirement.append(iStrReq)
    itemStrRequirement.append(iDexReq)
    itemIntRequirement.append(iIntReq)
        
wearable['lvlRequirement'] = itemLvlRequirement
wearable['dexRequirement'] = itemDexRequirement
wearable['strRequirement'] = itemStrRequirement
wearable['intRequirement'] = itemIntRequirement

In [36]:
wearable_infl_voc = list()
wearable_infl = list()
for k, influences in wearable.influences.iteritems():
    if isinstance(influences, dict):
        for name, _ in influences.items():
            item_influence = {'itemID': k}
            try:
                item_influence['influenceID'] = wearable_infl_voc.index(name) 
            except ValueError:
                wearable_infl_voc.append(name)
                item_influence['influenceID'] = len(wearable_infl_voc) - 1
            wearable_infl.append(item_influence)
wearable_infl_voc = pd.DataFrame(wearable_infl_voc)
wearable_infl = pd.DataFrame(wearable_infl)

In [85]:
wearable_sockets = list()
for k, v in wearable.sockets.iteritems():
    if not isinstance(v, list):
        continue
    
    temp = []
    for socket in v:
        if len(temp) <= socket['group']:
            temp.append([])
        temp[socket['group']].append(socket['sColour'])

    temp_group = np.array([len(tv) for tv in temp]).argsort()[::-1]
    for i, l in enumerate(temp_group):
        for v in temp[l]:
            wearable_sockets.append({'itemID':k , 'group':i, 'colour': v})
wearable_sockets = pd.DataFrame(wearable_sockets)

wearable['nSockets'] = wearable.sockets.apply(lambda y: sum([len(group) for group in y]) if isinstance(y, list) else 0)
wearable['nLinks'] = wearable.sockets.apply(lambda y: max([len(group) for group in y]) if isinstance(y, list) else 0)

In [149]:
wearable.columns

Index(['additionalProperties', 'corrupted', 'delve', 'duplicated', 'fractured',
       'identified', 'ilvl', 'incubatedItem', 'influences', 'itemLevel',
       'league', 'nextLevelRequirements', 'properties', 'requirements',
       'sockets', 'stackSize', 'synthesised', 'talismanTier', 'veiled',
       'veiledMods', 'subCategory', 'category', 'priceCurrency',
       'priceQuantity', 'rarity', 'nPrefixes', 'nSuffixes', 'nAffixes',
       'lvlRequirement', 'dexRequirement', 'strRequirement', 'intRequirement'],
      dtype='object')

In [136]:
items[items[w].notna()]['category'].value_counts()

currency    186
Name: category, dtype: int64

In [138]:
w = 'fractured'
wearable[wearable[w].notna()][w].value_counts()

False    43538
True        45
Name: fractured, dtype: int64

In [None]:
wea

In [None]:
'additionalProperties', 'delve',
'identified', 'ilvl', 'incubatedItem', 'influences', 'itemLevel',
'league', 'nextLevelRequirements', 'properties', 'requirements',
'sockets', 'stackSize', 'synthesised', 'talismanTier', 'veiled',
'veiledMods', 'subCategory', 'category', 'priceCurrency',
'priceQuantity', 'rarity', 'nPrefixes', 'nSuffixes', 'nAffixes',
'lvlRequirement', 'dexRequirement', 'strRequirement', 'intRequirement',
'nSockets', 'nLinks'