The goal of this notebook is to understand the original data structure and define basic operations.

The source data explored here was previously downloaded from the Path of Exile public stash tabs API to local .txt files. Each file in the data folder represents a single API call to http://www.pathofexile.com/api/public-stash-tabs?id= with a proper **nextChangeId**: the id returned by one call is applied to the next API call, and so on iteratively.

# TODO

1. add column to distinguish between unique and other items

In [1]:
import os
import re
import json

import pandas as pd

In [2]:
pd.set_option('display.max_columns', 500)

# folder holding the downloaded API responses (one .txt file per call)
DATA_PATH = '../../data/temp/stashes/'
# os.listdir() returns the entries in arbitrary order: sorting them makes the
# loading order (and so the row index of the items dataframe) reproducible
stashes_path = sorted(os.listdir(DATA_PATH))

LEAGUE = 'Metamorph'

# CURRENCY_KEY and CURRENCY_NAME are parallel lists: the abbreviation at a given
# position in CURRENCY_KEY belongs to the full name at the same position in CURRENCY_NAME
CURRENCY_KEY = ['alt', 'fuse', 'alch', 'chaos', 'gcp', 'exa', 'chrom', 'jew', 'chance', 'chisel', 'scour', 'blessed', 'regret', 'regal', 'divine', 'vaal', 'silver']
CURRENCY_NAME = ["Orb of Alteration", "Orb of Fusing", "Orb of Alchemy", "Chaos Orb", "Gemcutter's Prism", "Exalted Orb", "Chromatic Orb", "Jeweller's Orb", "Orb of Chance", "Cartographer's Chisel", "Orb of Scouring", "Blessed Orb", "Orb of Regret", "Regal Orb", "Divine Orb", "Vaal Orb", "Silver Coin"]

# item categories kept by the analysis (every other category is dropped)
ITEMS_CATEGORIES = ['accessories', 'armour', 'jewels', 'weapons', 'currency']

In [3]:
def get_stashes_dict(path):
    """Read the file at ``path`` and return its content parsed as JSON."""
    with open(path, 'rb') as stash_file:
        raw_content = stash_file.read()
    return json.loads(raw_content)

def get_stashes_items(verbose=False):
    """Collect the items of every public, non-empty stash found in the data folder.

    Every file listed in ``stashes_path`` is parsed with ``get_stashes_dict``.
    Each item of a kept stash is tagged with two stash-level values before being
    added to the result: the stash ``stash`` field (stored as ``stash_note``) and
    the stash ``league``.

    Parameters
    ----------
    verbose : bool, default False
        When True, print the running number of collected items after each file.

    Returns
    -------
    list of dict
        The item dictionaries, in the order in which the files are listed.
    """
    values = list()

    for file_name in stashes_path:
        # os.path.join works whether or not DATA_PATH ends with a path separator
        for stash in get_stashes_dict(os.path.join(DATA_PATH, file_name)):
            if len(stash['items']) > 0 and stash['public']:
                # add stash related data to single items
                for item in stash['items']:
                    item['stash_note'] = stash['stash']
                    item['league'] = stash['league']
                values.extend(stash['items'])

        if verbose:
            print('{}: {} items collected so far'.format(file_name, len(values)))

    return values

def get_string_price(string, allowed_currencies=None):
    """Parse a price note such as ``'~price 5 chaos'`` or ``'~b/o 10 exa'``.

    Parameters
    ----------
    string : str
        Text to parse; it must start with the price tag.
    allowed_currencies : collection of str, optional
        Accepted currency abbreviations. Defaults to the notebook-level
        ``CURRENCY_KEY`` list.

    Returns
    -------
    tuple
        ``(currency, quantity)`` where ``quantity`` is an ``int``.

    Raises
    ------
    ValueError
        If ``string`` is not a string, does not start with a valid price tag,
        carries a non-integer quantity (e.g. ``1.5`` or ``1/2``) or uses a
        currency that is not allowed.
    """
    if allowed_currencies is None:
        allowed_currencies = CURRENCY_KEY

    if not isinstance(string, str):
        raise ValueError('price note must be a string, got {!r}'.format(string))

    # The quantity and currency are read from the match groups rather than from
    # string.split(' '), so any whitespace separator accepted by the pattern is
    # handled. (?!\S) requires the currency to be a whole whitespace-delimited token.
    match = re.match(r'(?:~price|~b/o)\s(\d+(?:[./]\d+)?)\s(\w+)(?!\S)', string)
    if match is not None:
        raw_quantity, currency = match.groups()

        # fractional quantities are matched by the pattern but rejected here:
        # int() raises ValueError for them
        quantity = int(raw_quantity)

        # check if currency is an allowed value
        if currency in allowed_currencies:
            return currency, quantity
    raise ValueError('no valid price found in {!r}'.format(string))

# Data exploration

In [4]:
# load every stash item into a single dataframe (one row per item)
items = pd.DataFrame(get_stashes_items())

# keep the dataframe columns in alphabetical order
items = items.loc[:, sorted(items.columns)]

items.shape

(432781, 54)

In [5]:
# pandas built-in filling functionality
# The filled column is assigned back to the dataframe: calling
# fillna(inplace=True) on a column extracted from `items` is a chained in-place
# operation that does not update `items` when pandas Copy-on-Write is enabled.
flag_columns = ['corrupted', 'duplicated', 'elder', 'fractured', 'isRelic', 'shaper', 'synthesised']

for flag_column in flag_columns:
    # a missing flag means the flag is not set
    items[flag_column] = items[flag_column].fillna(False)

# a missing note becomes an empty string
items['note'] = items['note'].fillna('')

In [6]:
# replace every value that is not a list (i.e. a missing value) with an empty list
list_columns = ['craftedMods', 'enchantMods', 'explicitMods', 'implicitMods', 'fracturedMods', 'veiledMods', 'sockets', 'properties', 'requirements']

for list_column in list_columns:
    items[list_column] = items[list_column].apply(lambda cell: cell if isinstance(cell, list) else [])

In [7]:
# split the category dict into category (first key) and subCategory
# (first element of the first value, None when that value is empty)
category_entries = items['category'].apply(lambda cat: list(cat.items())[0])

items['subCategory'] = category_entries.apply(lambda entry: entry[1][0] if len(entry[1]) > 0 else None)
items['category'] = category_entries.apply(lambda entry: entry[0])

# drop the items whose category is not part of the analysis
unwanted_index = items.index[~items['category'].isin(ITEMS_CATEGORIES)]
items.drop(index=unwanted_index, inplace=True)

In [8]:
# extract items price and drop unpriced items
def _extract_price(note, stash_note):
    """Return the price found in the item note, else in the stash note, else (None, None)."""
    for text in (note, stash_note):
        try:
            return get_string_price(text)
        except (ValueError, TypeError, IndexError):
            # this text holds no valid price: try the next one
            continue
    # both item and stash have no valid price
    return (None, None)


price = [_extract_price(note, stash_note) for note, stash_note in zip(items['note'], items['stash_note'])]

# built column by column so that an empty dataframe is handled as well
items['priceCurrency'] = [item_price[0] for item_price in price]
items['priceQuantity'] = [item_price[1] for item_price in price]

# drop unpriced items (no price at all, or a price of zero)
unpriced = (items['priceCurrency'].isna() & items['priceQuantity'].isna()) | (items['priceQuantity'] == 0)
items = items.drop(index=items.index[unpriced])

In [9]:
# delete useless data

deletable = ['abyssJewel', 'additionalProperties', 'artFilename', 'cisRaceReward', 'cosmeticMods', 'delve', 'descrText', 'frameType', 'h', 'icon', 'id', 'inventoryId', 'maxStackSize', 'nextLevelRequirements', 'note', 'prophecyText', 'seaRaceReward', 'secDescrText', 'socketedItems', 'stash_note', 'support', 'thRaceReward', 'utilityMods', 'verified', 'w', 'x', 'y']

# columns that do not exist are reported; an explicit membership check replaces
# the bare `except` that would hide any other kind of error
missing = [column for column in deletable if column not in items.columns]
items = items.drop(columns=[column for column in deletable if column in items.columns])

if len(missing) > 0:
    print('Missing columns: {}'.format(missing))

## Currencies

In [10]:
# rows of the items dataframe whose category is 'currency'
currencies = items.loc[items['category'] == 'currency']

In [11]:
# keep only the currencies listed in CURRENCY_NAME; .copy() makes the result an
# independent dataframe, so adding a column does not raise SettingWithCopyWarning
currencies = currencies[currencies.typeLine.isin(CURRENCY_NAME)].copy()

# translate the full currency name into its abbreviation with a dict lookup
currency_name_to_key = dict(zip(CURRENCY_NAME, CURRENCY_KEY))
currencies['sellCurrency'] = currencies.typeLine.map(currency_name_to_key)

In [12]:
# the stack size of a currency item is the quantity being sold
currencies = currencies.rename(columns={"stackSize": "sellQuantity"})

In [13]:
# exchange rates seen from the seller side (sold / price) and from the buyer side (price / sold)
sellerCRate, buyerCRate = [], []

for sell_quantity, price_quantity in zip(currencies['sellQuantity'], currencies['priceQuantity']):
    sold, paid = int(sell_quantity), int(price_quantity)
    sellerCRate.append(sold / paid)
    buyerCRate.append(paid / sold)

currencies['sellerRate'] = sellerCRate
currencies['buyerRate'] = buyerCRate

In [14]:
# delete useless data

deletable = ['category', 'corrupted', 'craftedMods', 'duplicated', 'elder', 'enchantMods', 'explicitMods', 'flavourText', 'fractured', 'fracturedMods', 'identified', 'ilvl', 'implicitMods', 'isRelic', 'name', 'properties', 'requirements', 'shaper', 'sockets' ,'synthesised', 'subCategory', 'talismanTier', 'typeLine', 'vaal', 'veiled', 'veiledMods']

# columns that do not exist are reported; an explicit membership check replaces
# the bare `except` that would hide any other kind of error
missing = [column for column in deletable if column not in currencies.columns]
currencies = currencies.drop(columns=[column for column in deletable if column in currencies.columns])

if len(missing) > 0:
    print('Missing columns: {}'.format(missing))

In [15]:
# show one randomly picked row
currencies.sample(n=1)

Unnamed: 0,league,sellQuantity,priceCurrency,priceQuantity,sellCurrency,sellerRate,buyerRate
1927,Synthesis,27.0,chaos,1,silver,27.0,0.037037


## Mod based items

In [25]:
def items_mods_formatting(items, target_mod):
    """Split a column of mod strings into a mods table and a mods vocabulary.

    Every mod text is made generic by replacing each run of digits with ``##``;
    generic texts are numbered in order of first appearance.

    Parameters
    ----------
    items : pandas.DataFrame
        Dataframe whose ``target_mod`` column holds, for each row, an iterable of
        mod strings.
    target_mod : str
        Name of the column to process.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(mods, vocabulary)``. ``mods`` has one row per (item, mod) pair with the
        columns ``itemId`` (index label of the item), ``modId`` (position of the
        generic text in ``vocabulary``) and ``value0``, ``value1``, ... (the runs
        of digits found in the mod, kept as strings). ``vocabulary`` has a single
        ``text`` column holding the generic mod texts.
    """
    # generic text -> modId; a dict keeps insertion order and gives a constant-time
    # lookup instead of a linear list.index() search for every mod
    mod_ids = {}
    witems_mods = []

    for item_id, mods in zip(items.index, items[target_mod]):
        for mod in mods:
            generic_mod = re.sub(r'\d+', '##', mod)
            # ids are assigned in order of first appearance
            mod_index = mod_ids.setdefault(generic_mod, len(mod_ids))

            item_mod = {'itemId': item_id, 'modId': mod_index}
            for position, value in enumerate(re.findall(r'\d+', mod)):
                item_mod['value{}'.format(position)] = value

            witems_mods.append(item_mod)

    witems_mods = pd.DataFrame(witems_mods)
    witems_mods_vocabulary = pd.DataFrame(list(mod_ids), columns=['text'])

    return witems_mods, witems_mods_vocabulary

### Items rarity

### Categories based analysis

In [79]:
mod_items = items[items.category.isin(['armour', 'accessories', 'weapons', 'jewels']) ]

# An integer or decimal number; the dot is escaped so that it only matches a
# decimal separator (an unescaped '.' matches any character).
number_pattern = re.compile(r'\d+(?:\.\d+)?')


def _generalize_mods(mods):
    """Replace every number of every mod text with the placeholder 'N'."""
    return [number_pattern.sub('N', mod) for mod in mods]


temp_mod_items = mod_items.copy()
for mod_column in ['craftedMods', 'enchantMods', 'explicitMods', 'fracturedMods', 'implicitMods']:
    temp_mod_items[mod_column] = mod_items[mod_column].apply(_generalize_mods)

# for explicit mods containing ':' keep only the second ':'-separated segment, stripped
temp_mod_items['explicitMods'] = temp_mod_items['explicitMods'].apply(lambda mods: [mod.split(':')[1].strip() if ':' in mod else mod for mod in mods])

In [80]:
def _distinct_mods(mod_column):
    """Set of the distinct generic mod texts found in a column of temp_mod_items."""
    return {mod for mods in temp_mod_items[mod_column].values for mod in mods}


crafted_s = _distinct_mods('craftedMods')
enchant_s = _distinct_mods('enchantMods')
explicit_s = _distinct_mods('explicitMods')
fractured_s = _distinct_mods('fracturedMods')
implicit_s = _distinct_mods('implicitMods')
print(*(len(mods_set) for mods_set in (crafted_s, enchant_s, explicit_s, fractured_s, implicit_s)))

# mods sets intersections (each pair of sets is compared once)
print(*(len(crafted_s & other) for other in (enchant_s, explicit_s, fractured_s, implicit_s)))
print(*(len(enchant_s & other) for other in (explicit_s, fractured_s, implicit_s)))
print(*(len(explicit_s & other) for other in (fractured_s, implicit_s)))
print(len(fractured_s & implicit_s))

183 489 2089 148 286
1 139 77 86
8 0 0
148 221
87


In [77]:
# one dataframe per item category
armour, accessories, jewels, weapons = (
    temp_mod_items[temp_mod_items['category'] == category_name]
    for category_name in ('armour', 'accessories', 'jewels', 'weapons')
)

In [78]:
def _implicit_mods_of(frame):
    """Set of the distinct implicit mod texts found in a dataframe."""
    return {mod for mods in frame['implicitMods'].values for mod in mods}


armour_mods_s = _implicit_mods_of(armour)
accessories_mods_s = _implicit_mods_of(accessories)
jewels_mods_s = _implicit_mods_of(jewels)
weapons_mods_s = _implicit_mods_of(weapons)
print(*(len(mods_set) for mods_set in (armour_mods_s, accessories_mods_s, jewels_mods_s, weapons_mods_s)))

# intersection between mods sets (each pair of sets is compared once)
print(*(len(armour_mods_s & other) for other in (accessories_mods_s, jewels_mods_s, weapons_mods_s)))
print(*(len(accessories_mods_s & other) for other in (jewels_mods_s, weapons_mods_s)))
print(len(jewels_mods_s & weapons_mods_s))

108 97 0 65
43 0 20
0 20
0


### Wearable items

In [16]:
# independent copy of the armour, weapons and accessories rows
wearable_mask = items['category'].isin(['armour', 'weapons', 'accessories'])
witems = items.loc[wearable_mask].copy()

In [17]:
# items rarity
def _guess_rarity(flavour_text, explicit_mods):
    """Label an item from its flavour text and its number of explicit mods."""
    if isinstance(flavour_text, list):
        # only unique items have a flavour text defined
        return 'unique'

    mods_count = len(explicit_mods)
    if mods_count == 0:
        return 'normal'
    if 1 <= mods_count <= 2:
        return 'enchanted'
    if mods_count >= 3:
        return 'rare'
    return 'ERROR'


witems_rarity = [
    _guess_rarity(flavour_text, explicit_mods)
    for flavour_text, explicit_mods in zip(witems['flavourText'], witems['explicitMods'])
]

witems['rarity'] = witems_rarity

In [18]:
# replace the veiledMods list with the number of its entries that mention 'Prefix' or 'Suffix'
witems['veiledMods'] = witems['veiledMods'].apply(
    lambda veiled: len([mod for mod in veiled if 'Prefix' in mod or 'Suffix' in mod])
)

In [19]:
# Base quality = value of the 'Quality' property minus the quality granted by
# crafted mods. The 'Quality' entry is also removed from the item properties:
# a new list is built for that, because deleting from a list while iterating it
# skips the element that follows the deletion, and because the original list
# objects may be shared with the dataframe this one was copied from.
witems_base_quality = list()
witems_properties = list()

for item_properties, item_crafted_mods in zip(witems['properties'], witems['craftedMods']):
    witem_quality = 0
    kept_properties = []

    for prop in item_properties:
        if prop['name'] == 'Quality':
            # [1:-1] drops the first and the last character of the value
            # (presumably the '+' sign and the '%' symbol)
            witem_quality = int(prop['values'][0][0][1:-1])
        else:
            kept_properties.append(prop)

    for mod in item_crafted_mods:
        if re.match(r'^\+\d+%\sto\sQuality$', mod):
            witem_quality -= int(mod.split(' ')[0][1:-1])

    witems_base_quality.append(witem_quality)
    witems_properties.append(kept_properties)

# a Series keeps each list as a single cell value
witems['properties'] = pd.Series(witems_properties, index=witems.index, dtype=object)
witems['baseQuality'] = witems_base_quality

In [20]:
# remove items useless properties such as item name (all the properties which don't have a defined value)
# Each item keeps its own filtered list. Assigning a value_counts() result here
# would be aligned on property names instead of on the items index.
witems['properties'] = witems['properties'].apply(
    lambda item_properties: [prop for prop in item_properties if prop.get('values')]
)

In [22]:
# extract items requirements (0 when an item has no requirement of a given kind)
itemLvlRequirement, itemDexRequirement, itemStrRequirement, itemIntRequirement = [], [], [], []

for item_requirements in witems['requirements']:
    iLvlReq, iStrReq, iDexReq, iIntReq = 0, 0, 0, 0
    for ireq in item_requirements:
        req_value = int(ireq['values'][0][0])

        # attribute requirements are matched on both their short and long name
        if ireq['name'] == 'Level':
            iLvlReq = req_value
        elif ireq['name'] in ['Str', 'Strength']:
            iStrReq = req_value
        elif ireq['name'] in ['Dex', 'Dexterity']:
            iDexReq = req_value
        elif ireq['name'] in ['Int', 'Intelligence']:
            iIntReq = req_value

    # each value goes to the list of its own attribute
    itemLvlRequirement.append(iLvlReq)
    itemDexRequirement.append(iDexReq)
    itemStrRequirement.append(iStrReq)
    itemIntRequirement.append(iIntReq)

witems['lvlRequirement'] = itemLvlRequirement
witems['dexRequirement'] = itemDexRequirement
witems['strRequirement'] = itemStrRequirement
witems['intRequirement'] = itemIntRequirement

In [23]:
# extract items sockets into a separate dataframe: one row per socket,
# linked to its item through the item index label
witems_sockets = pd.DataFrame([
    {'itemId': item_id, 'group': socket['group'], 'socketColour': socket['sColour']}
    for item_id, item_sockets in zip(witems.index, witems['sockets'])
    for socket in item_sockets
])

In [26]:
# one (mods table, vocabulary) pair per kind of mod
(
    (witems_crafted_mods, crafted_mods),
    (witems_enchant_mods, enchant_mods),
    (witems_explicit_mods, explicit_mods),
    (witems_fractured_mods, fractured_mods),
    (witems_implicit_mods, implicit_mods),
) = [
    items_mods_formatting(witems, mod_column)
    for mod_column in ('craftedMods', 'enchantMods', 'explicitMods', 'fracturedMods', 'implicitMods')
]

In [29]:
# shape of each mods vocabulary
tuple(vocabulary.shape for vocabulary in (crafted_mods, enchant_mods, explicit_mods, fractured_mods, implicit_mods))

((184, 1), (490, 1), (1747, 1), (98, 1), (271, 1))

In [30]:
deletable = ['name', 'properties', 'requirements', 'sockets', 'stackSize', 'typeLine', 'vaal', 'craftedMods', 'enchantMods', 'explicitMods', 'flavourText', 'fracturedMods', 'implicitMods', 'veiled']

# columns that do not exist are reported; an explicit membership check replaces
# the bare `except` that would hide any other kind of error
missing = [column for column in deletable if column not in witems.columns]
witems = witems.drop(columns=[column for column in deletable if column in witems.columns])

if len(missing) > 0:
    print('Missing columns: {}'.format(missing))

In [31]:
# show one randomly picked row
witems.sample(n=1)

Unnamed: 0,category,corrupted,duplicated,elder,fractured,identified,ilvl,isRelic,league,shaper,synthesised,talismanTier,veiledMods,subCategory,priceCurrency,priceQuantity,rarity,baseQuality,lvlRequirement,dexRequirement,strRequirement,intRequirement
261930,armour,False,False,False,False,True,6,False,Hardcore Synthesis,False,False,,0,quiver,alch,1,enchanted,0,4,0,0,0


### Jewels

In [34]:
# independent copy of the jewels rows
jewels = items.loc[items['category'].isin(['jewels'])].copy()

In [38]:
# veiled mods of the jewels that have at least one
has_veiled_mods = jewels['veiledMods'].apply(lambda veiled: len(veiled) > 0)
jewels.loc[has_veiled_mods, 'veiledMods']

Series([], Name: veiledMods, dtype: object)

In [82]:
# Radius of each jewel, read from its 'Radius' property; the string 'None' marks
# jewels without one. The column is iterated directly: Series.iteritems() was
# removed in pandas 2.0.
jewels_radius = list()

for jewel_properties in jewels['properties']:
    j_rad = None
    if isinstance(jewel_properties, list):
        for j_prop in jewel_properties:
            if j_prop['name'] == 'Radius':
                j_rad = j_prop['values'][0][0]

    jewels_radius.append(j_rad if j_rad is not None else 'None')

jewels['radius'] = jewels_radius

In [98]:
# requirements of the jewels that have at least one
has_requirements = jewels['requirements'].apply(lambda reqs: len(reqs) > 0)
jewels.loc[has_requirements, 'requirements'].values

array([list([{'name': 'Level', 'values': [['32', 0]], 'displayMode': 0}]),
       list([{'name': 'Level', 'values': [['32', 0]], 'displayMode': 0}]),
       list([{'name': 'Level', 'values': [['32', 0]], 'displayMode': 0}]),
       ...,
       list([{'name': 'Level', 'values': [['41', 0]], 'displayMode': 0}]),
       list([{'name': 'Level', 'values': [['29', 0]], 'displayMode': 0}]),
       list([{'name': 'Level', 'values': [['38', 0]], 'displayMode': 0}])],
      dtype=object)

In [99]:
def _level_requirement(requirements):
    """Return the 'Level' requirement as an int, or 0 when there is none."""
    for requirement in requirements:
        if requirement['name'] == 'Level':
            return int(requirement['values'][0][0])
    return 0


# one value per jewel: a value_counts() result is indexed by level, not by jewel,
# so it cannot be assigned as a column
jewels['lvlRequirement'] = jewels['requirements'].apply(_level_requirement)

In [71]:
# flag the jewels whose subcategory is 'abyss'
jewels['isAbyss'] = jewels['subCategory'] == 'abyss'

In [72]:
deletable = ['category', 'craftedMods', 'elder', 'enchantMods', 'flavourText', 'name',
        'properties', 'requirements', 'shaper', 'sockets', 'stackSize', 'subCategory', 
        'talismanTier', 'typeLine', 'vaal', 'veiledPrefix', 'veiledSuffix']

# columns that do not exist are reported; an explicit membership check replaces
# the bare `except` that would hide any other kind of error
missing = [column for column in deletable if column not in jewels.columns]
jewels = jewels.drop(columns=[column for column in deletable if column in jewels.columns])

if len(missing) > 0:
    print('Missing columns: {}'.format(missing))

Missing columns: ['veiledPrefix', 'veiledSuffix']


In [74]:
jewels.sample()

Unnamed: 0,corrupted,duplicated,explicitMods,fractured,fracturedMods,identified,ilvl,implicitMods,isRelic,league,synthesised,veiled,veiledMods,priceCurrency,priceQuantity,radius,isAbyss
387313,False,False,"[16% increased Chaos Damage, 6% increased Atta...",False,[],True,83,[],False,Standard,False,,[],exa,5,,False


In [56]:
# distinct implicit mods of the jewels, every integer or decimal number replaced by 'N'
# (the dot is escaped so that it only matches a decimal separator)
jewels_mods = {re.sub(r'\d+(?:\.\d+)?', 'N', mod) for mods in jewels['implicitMods'] for mod in mods}
len(jewels_mods)

26

In [57]:
# distinct explicit mods of the jewels, every integer or decimal number replaced by 'N'
# (the dot is escaped so that it only matches a decimal separator)
jewels_mods = {re.sub(r'\d+(?:\.\d+)?', 'N', mod) for mods in jewels['explicitMods'] for mod in mods}
len(jewels_mods)

529

In [58]:
# distinct fractured mods of the jewels, every integer or decimal number replaced by 'N'
# (the dot is escaped so that it only matches a decimal separator)
jewels_mods = {re.sub(r'\d+(?:\.\d+)?', 'N', mod) for mods in jewels['fracturedMods'] for mod in mods}
len(jewels_mods)

75