A first implementation of an ETL procedure is written and tested here. This task is based on, and reuses code from, *0_data_first_exploration*.

In [26]:
import os
import json

import re

import pandas as pd
import numpy as np

from smart_open import open

In [6]:
# Let pandas display up to 500 columns for wide frames.
pd.options.display.max_columns = 500

# No league selected yet.
LEAGUE_NAME = None

In [7]:
# Load the cleaned next_change_id CSV, using its first column as the index.
NCI_CSV_PATH = '../../data/next_change_id/next_change_id_cleaned.csv'
nci = pd.read_csv(NCI_CSV_PATH, index_col=0)

In [17]:
# Change id used as the starting point (same value the extraction cell starts from).
STARTING_NCI = '606710732-624261706-592413894-672863464-640396486'

# Public stash tab endpoint; a change id is appended after `id=`.
BASE_URL = 'http://www.pathofexile.com/api/public-stash-tabs?id='

# League names of interest -- presumably compared with each stash's 'league'
# value (TODO confirm once the filter is wired in).
LEAGUE_NAME = ['Metamorph']

In [18]:
# Currency abbreviations accepted in price notes (see get_string_price).
CURRENCY_KEY = [
    'alt', 'fuse', 'alch', 'chaos', 'gcp', 'exa',
    'chrom', 'jew', 'chance', 'chisel', 'scour', 'blessed',
    'regret', 'regal', 'divine', 'vaal', 'silver', 'coin',
]

# Item names (matched against `typeLine`), index-aligned with CURRENCY_KEY:
# CURRENCY_NAME[i] is the full name of the currency abbreviated CURRENCY_KEY[i].
CURRENCY_NAME = [
    "Orb of Alteration", "Orb of Fusing", "Orb of Alchemy", "Chaos Orb", "Gemcutter's Prism", "Exalted Orb",
    "Chromatic Orb", "Jeweller's Orb", "Orb of Chance", "Cartographer's Chisel", "Orb of Scouring", "Blessed Orb",
    "Orb of Regret", "Regal Orb", "Divine Orb", "Vaal Orb", "Silver Coin", "Perandus Coin",
]

# Item categories handled by the transform step.
ITEMS_CATEGORIES = ['accessories', 'armour', 'jewels', 'weapons', 'currency']

In [43]:
def reached_new_nci(curr_nci, next_nci):
    """Tell whether ``curr_nci`` has moved past ``next_nci``.

    Both ids are dash-separated strings of integers. Components are compared
    pairwise (surplus components of the longer id are ignored) and the result
    is true when the mean of ``curr - next`` is strictly positive.
    """
    current_parts = curr_nci.split('-')
    target_parts = next_nci.split('-')

    deltas = []
    for current_part, target_part in zip(current_parts, target_parts):
        deltas.append(int(current_part) - int(target_part))

    return np.mean(deltas) > 0

In [20]:
def get_stashes_dict(path):
    """Open ``path`` in binary mode and return its content decoded from JSON."""
    with open(path, 'rb') as stream:
        payload = json.load(stream)
    return payload

def get_stashes_items(stashes, verbose=False):
    """Flatten the items of every public, non-empty stash into a single list.

    Each item dict is annotated in place with ``stash_note`` (the stash's
    ``stash`` value) and ``league`` (the stash's ``league`` value) before
    being collected. ``verbose`` is accepted but not used.
    """
    collected = []

    for stash in stashes['stashes']:
        # skip empty or private stashes
        if len(stash['items']) == 0 or not stash['public']:
            continue

        # propagate stash-level data down to each of its items
        for item in stash['items']:
            item['stash_note'] = stash['stash']
            item['league'] = stash['league']

        collected += stash['items']

    return collected

In [21]:
def get_string_price(string, allowed_currencies=None):
    """Parse a price note such as ``~price 10 chaos`` or ``~b/o 1/2 exa``.

    The note must start with ``~price`` or ``~b/o``, followed by a quantity
    (integer, decimal like ``1.5`` or fraction like ``1/2``) and a currency
    abbreviation, each separated by a single whitespace character.

    Parameters
    ----------
    string : str
        The note to parse.
    allowed_currencies : collection of str, optional
        Accepted currency abbreviations. Defaults to the module-level
        ``CURRENCY_KEY``.

    Returns
    -------
    tuple
        ``(currency, quantity)``; ``quantity`` is an ``int`` for whole
        numbers and a ``float`` for decimal or fractional quantities.

    Raises
    ------
    ValueError
        If ``string`` is not a string, does not match the price format, has a
        zero denominator, or uses a currency that is not allowed.
    """
    if allowed_currencies is None:
        allowed_currencies = CURRENCY_KEY

    if not isinstance(string, str):
        # e.g. NaN coming from a missing note
        raise ValueError()

    # The trailing (?!\S) makes the currency a whole whitespace-delimited token,
    # so "chaos-orb" is not accepted as "chaos".
    match = re.match(r'(?:~price|~b/o)\s(\d+)(?:([./])(\d+))?\s(\w+)(?!\S)', string)
    if match is None:
        raise ValueError()

    whole, separator, fraction, currency = match.groups()
    if currency not in allowed_currencies:
        raise ValueError()

    if separator is None:
        quantity = int(whole)
    elif separator == '.':
        quantity = float('{}.{}'.format(whole, fraction))
    else:
        # fractional price such as "1/2"
        denominator = int(fraction)
        if denominator == 0:
            raise ValueError()
        quantity = int(whole) / denominator

    return currency, quantity
    
def fitler_items_price(items):
    """Attach a parsed price to every item and drop the items without one.

    For each row the item's own ``note`` is parsed first; if it holds no valid
    price the stash-wide ``stash_note`` is tried. The result is stored in two
    new columns, ``priceCurrency`` and ``priceQuantity``. Rows with no price
    at all, or with a quantity of 0, are removed.

    ``items`` is modified in place and also returned. An empty frame is
    handled and simply gains the two (empty) price columns.
    """
    price_currencies, price_quantities = [], []

    for _, row in items.iterrows():
        currency, quantity = None, None
        # the item's own note takes precedence over the stash note
        for note in (row.note, row.stash_note):
            try:
                currency, quantity = get_string_price(note)
                break
            except (ValueError, TypeError, IndexError):
                # not a valid price note: try the next candidate
                continue
        price_currencies.append(currency)
        price_quantities.append(quantity)

    # assigning the two lists separately also works when there are no rows
    items['priceCurrency'] = price_currencies
    items['priceQuantity'] = price_quantities

    has_no_price = items.priceCurrency.isna() & items.priceQuantity.isna()
    is_free = items.priceQuantity == 0
    items.drop(index=items[has_no_price | is_free].index, inplace=True)
    return items

def get_items_rarity(items):
    """Return one rarity label per row of ``items``, in row order.

    A row whose ``flavourText`` is a list is labelled 'unique'; otherwise the
    label depends on the number of ``explicitMods``: 0 -> 'normal',
    1 or 2 -> 'magic', 3 or more -> 'rare'.
    """
    def classify(item):
        # only unique items have a flavour text defined
        if isinstance(item.flavourText, list):
            return 'unique'

        mods_count = len(item.explicitMods)
        if mods_count == 0:
            return 'normal'
        if mods_count <= 2:
            return 'magic'
        return 'rare'

    return [classify(row) for _, row in items.iterrows()]

In [61]:
def transform_generic_items(stashes):
    """Build the cleaned, priced item table shared by all category transforms.

    Steps:
      1. flatten the stashes into one row per item (``get_stashes_items``);
      2. fill missing boolean flags with ``False``, missing notes with ``''``
         and missing list-valued fields with ``[]`` (a column that is absent
         altogether is created with that default);
      3. split the ``category`` dict into ``category`` (its first key) and
         ``subCategory`` (first element of that key's list, or ``None``);
      4. keep only priced items (``fitler_items_price``);
      5. drop the columns that are not used downstream.

    Returns the resulting DataFrame.
    """
    flag_columns = ['corrupted', 'duplicated', 'elder', 'fractured', 'isRelic', 'shaper', 'synthesised']
    list_columns = ['craftedMods', 'enchantMods', 'explicitMods', 'implicitMods', 'fracturedMods',
                    'veiledMods', 'sockets', 'properties', 'requirements']
    unused_columns = ['abyssJewel', 'additionalProperties', 'artFilename', 'cisRaceReward', 'cosmeticMods',
                      'delve', 'descrText', 'frameType', 'h', 'icon', 'id', 'inventoryId', 'maxStackSize',
                      'nextLevelRequirements', 'note', 'prophecyText', 'seaRaceReward', 'secDescrText',
                      'socketedItems', 'stash_note', 'support', 'thRaceReward', 'utilityMods', 'verified',
                      'w', 'x', 'y']

    items = pd.DataFrame(get_stashes_items(stashes))

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # an attribute access, which is not guaranteed to update the frame.
    for column in flag_columns:
        if column in items.columns:
            items[column] = items[column].fillna(False)
        else:
            items[column] = False

    if 'note' in items.columns:
        items['note'] = items['note'].fillna('')
    else:
        items['note'] = ''

    # fill missing values with empty list
    for column in list_columns:
        if column in items.columns:
            items[column] = items[column].apply(lambda value: value if isinstance(value, list) else [])
        else:
            items[column] = pd.Series([[] for _ in range(len(items))], index=items.index, dtype=object)

    # `category` is a one-entry dict: {category name: [sub categories]}
    category_entries = [list(category.items())[0] for category in items['category']]
    items['subCategory'] = [subs[0] if len(subs) > 0 else None for _, subs in category_entries]
    items['category'] = [name for name, _ in category_entries]

    items = fitler_items_price(items)

    # tolerate columns the current batch does not contain
    return items.drop(columns=unused_columns, errors='ignore')

def transform_currencies(items=None):
    """Extract currency listings and compute their exchange rates.

    Parameters
    ----------
    items : pandas.DataFrame, optional
        Generic item table (as produced by ``transform_generic_items``). When
        omitted, the module-level ``items`` variable is used, which keeps the
        previous zero-argument call working.

    Returns
    -------
    pandas.DataFrame
        One row per listing whose ``typeLine`` is in ``CURRENCY_NAME``, with
        ``sellCurrency`` (abbreviation from ``CURRENCY_KEY``), ``sellQuantity``
        (renamed ``stackSize``), ``sellerRate`` (sell / price) and
        ``buyerRate`` (price / sell). The input frame is left untouched.

    Raises
    ------
    ValueError
        If no ``items`` frame is given and none exists at module level.
    """
    if items is None:
        items = globals().get('items')
        if items is None:
            raise ValueError('transform_currencies() requires the generic items DataFrame')

    is_known_currency = (items.category == 'currency') & items.typeLine.isin(CURRENCY_NAME)
    # copy so that the new columns are written to an independent frame
    currencies = items[is_known_currency].copy()

    key_by_name = dict(zip(CURRENCY_NAME, CURRENCY_KEY))
    currencies['sellCurrency'] = currencies['typeLine'].map(key_by_name)

    currencies = currencies.rename(columns={"stackSize": "sellQuantity"})

    sell_quantity = currencies['sellQuantity'].astype(float)
    price_quantity = currencies['priceQuantity'].astype(float)
    currencies['sellerRate'] = sell_quantity / price_quantity
    currencies['buyerRate'] = price_quantity / sell_quantity

    # item-only attributes are meaningless for currencies
    unused_columns = ['category', 'corrupted', 'craftedMods', 'duplicated', 'elder', 'enchantMods',
                      'explicitMods', 'flavourText', 'fractured', 'fracturedMods', 'identified', 'ilvl',
                      'implicitMods', 'isRelic', 'name', 'properties', 'requirements', 'shaper', 'sockets',
                      'synthesised', 'subCategory', 'talismanTier', 'typeLine', 'vaal', 'veiled', 'veiledMods']
    return currencies.drop(columns=unused_columns, errors='ignore')
    
def transform_jewels(items):
    """Extract jewel listings and derive jewel-specific columns.

    Adds to the rows whose category is 'jewels':
      * ``rarity`` -- from ``get_items_rarity``;
      * ``radius`` -- value of the 'Radius' property, or the string 'None';
      * ``lvlRequirement`` -- value of the first requirement as an int, or 0
        when there are no requirements (assumes the first requirement is the
        level one -- TODO confirm);
      * ``isAbyss`` -- whether ``subCategory`` is 'abyss'.

    Columns not relevant to jewels are dropped (absent ones are ignored).
    Returns a new DataFrame; ``items`` is not modified.
    """
    def radius_of(properties):
        if not isinstance(properties, list):
            return 'None'
        radius = None
        for prop in properties:
            if prop['name'] == 'Radius':
                radius = prop['values'][0][0]
        return radius if radius is not None else 'None'

    # copy so that the new columns are written to an independent frame
    jewels = items[items.category == 'jewels'].copy()

    jewels['rarity'] = get_items_rarity(jewels)
    jewels['radius'] = [radius_of(properties) for properties in jewels['properties']]
    # per-row value (not a value_counts() summary, which would be misaligned)
    jewels['lvlRequirement'] = [
        int(requirements[0]['values'][0][0]) if len(requirements) > 0 else 0
        for requirements in jewels['requirements']
    ]
    jewels['isAbyss'] = jewels['subCategory'] == 'abyss'

    unused_columns = ['category', 'craftedMods', 'elder', 'enchantMods', 'flavourText', 'name', 'properties',
                      'requirements', 'shaper', 'sockets', 'stackSize', 'subCategory', 'talismanTier',
                      'typeLine', 'vaal', 'veiledPrefix', 'veiledSuffix']
    return jewels.drop(columns=unused_columns, errors='ignore')

def transform_wearable_items(items):
    """Derive the columns shared by weapons, armours and accessories.

    Adds ``rarity``, ``baseQuality`` and the ``lvl``/``str``/``dex``/``int``
    ``Requirement`` columns, replaces ``veiledMods`` with the number of veiled
    prefixes/suffixes, and drops the raw columns these were derived from.

    The socket table and the formatted mods are computed but not returned yet.

    Returns a new DataFrame; the frame passed in is not modified.
    """
    # Callers pass filtered slices; copy so the new columns are written to an
    # independent frame.
    items = items.copy()

    items['rarity'] = get_items_rarity(items)

    # replace the veiled mods list with the count of veiled prefixes/suffixes
    items['veiledMods'] = items.veiledMods.apply(
        lambda mods: sum('Prefix' in mod or 'Suffix' in mod for mod in mods))

    # extract items base quality: displayed quality minus crafted quality
    items_base_quality = []
    for _, row in items.iterrows():
        item_quality = 0
        for prop in row.properties:
            if prop['name'] == 'Quality':
                # value looks like "+20%": strip the sign and the percent
                item_quality = int(prop['values'][0][0][1:-1])

        for mod in row.craftedMods:
            if re.match(r'^\+\d+%\sto\sQuality$', mod):
                item_quality -= int(mod.split(' ')[0][1:-1])

        items_base_quality.append(item_quality)
    items['baseQuality'] = items_base_quality

    # extract items requirements (0 when a requirement is absent)
    lvl_requirements, str_requirements, dex_requirements, int_requirements = [], [], [], []
    for _, row in items.iterrows():
        lvl_req, str_req, dex_req, int_req = 0, 0, 0, 0
        for requirement in row['requirements']:
            req_value = int(requirement['values'][0][0])

            if requirement['name'] == 'Level':
                lvl_req = req_value
            elif requirement['name'] in ['Str', 'Strength']:
                str_req = req_value
            elif requirement['name'] in ['Dex', 'Dexterity']:
                dex_req = req_value
            elif requirement['name'] in ['Int', 'Intelligence']:
                int_req = req_value
        lvl_requirements.append(lvl_req)
        str_requirements.append(str_req)
        dex_requirements.append(dex_req)
        int_requirements.append(int_req)
    items['lvlRequirement'] = lvl_requirements
    items['dexRequirement'] = dex_requirements
    items['strRequirement'] = str_requirements
    items['intRequirement'] = int_requirements

    # extract items sockets into a separate dataframe
    # NOTE(review): built but not returned or stored yet.
    items_sockets = []
    for item_id, row in items.iterrows():
        for socket in row['sockets']:
            items_sockets.append({
                'itemId': item_id,
                'group': socket['group'],
                'socketColour': socket['sColour'],
            })
    items_sockets = pd.DataFrame(items_sockets)

    # items mods
    # NOTE(review): items_mods_formatting is not defined in the cells visible
    # here and its results are not used yet; the calls are kept as they were.
    items_crafted_mods, crafted_mods = items_mods_formatting(items, 'craftedMods')
    items_enchant_mods, enchant_mods = items_mods_formatting(items, 'enchantMods')
    items_explicit_mods, explicit_mods = items_mods_formatting(items, 'explicitMods')
    items_fractured_mods, fractured_mods = items_mods_formatting(items, 'fracturedMods')
    items_implicit_mods, implicit_mods = items_mods_formatting(items, 'implicitMods')

    # deletable data
    deletable = ['name', 'properties', 'requirements', 'sockets', 'stackSize', 'typeLine', 'vaal', 'craftedMods',
                 'enchantMods', 'explicitMods', 'flavourText', 'fracturedMods', 'implicitMods', 'veiled']
    missing = [column for column in deletable if column not in items.columns]
    items = items.drop(columns=deletable, errors='ignore')

    if len(missing) > 0:
        print('Missing columns: {}'.format(missing))

    return items
        
def transform_weapons(items):
    """Run ``transform_wearable_items`` on the rows whose category is 'weapons'."""
    is_weapon = items.category == 'weapons'
    return transform_wearable_items(items[is_weapon])
    
def transform_armours(items):
    """Run ``transform_wearable_items`` on the rows whose category is 'armour'."""
    is_armour = items.category == 'armour'
    return transform_wearable_items(items[is_armour])
    
def transform_accessories(items):
    """Run ``transform_wearable_items`` on the rows whose category is 'accessories'."""
    is_accessory = items.category == 'accessories'
    return transform_wearable_items(items[is_accessory])
    

In [76]:
a = '607495852-625043701-593194195-673707096-641274798'
b = '607537014-625079404-593232312-673756689-641313696'

# component-wise midpoint (truncated to an integer) of the two change ids
list(map(lambda pair: int((int(pair[1]) + int(pair[0])) / 2), zip(a.split('-'), b.split('-'))))

[607516433, 625061552, 593213253, 673731892, 641294247]

In [37]:
%%time

# EXTRACT: access poe stash tab api
# Starting from `currenct_nci`, repeatedly download the page at BASE_URL + id and
# follow its 'next_change_id' until reached_new_nci(currenct_nci, next_nci) is true.
currenct_nci = '606710732-624261706-592413894-672863464-640396486'
next_nci = '606725552-624277463-592429392-672877972-640415214'
while not reached_new_nci(currenct_nci, next_nci):
    # `open` is the one imported from smart_open at the top; it is given an http URL here
    with open(BASE_URL + currenct_nci, 'rb') as file:
        stashes = json.load(file)
    
    # advance to the change id announced by the page just downloaded
    currenct_nci = stashes['next_change_id']
    
# NOTE(review): the transform/load draft below is disabled. It calls
# extract_generic_items, which is not defined in the cells visible here
# (transform_generic_items looks like the intended function -- confirm).
#     # TRANSFORM: process data
#     items = extract_generic_items(stashes)
    
#     currencies = transform_currencies(items)
    
#     jewels = transform_jewels(items)
    
#     armours = transform_armours(items)
#     weapons = transform_weapons(items)
#     accessories = transform_accessories(items)
    
#     # LOAD: load data into a datalake
#     break

TypeError: unsupported operand type(s) for -: 'list' and 'list'

In [24]:
# print the league of every stash that declares one
for league in (stash['league'] for stash in stashes['stashes']):
    if league is None:
        continue
    print(league)

NameError: name 'stashes' is not defined

In [11]:
# inspect the items of the first stash in `stashes`
stashes['stashes'][0]['items']

[]

In [None]:
for stash in stashes['stashes']:
    for item in stash['items']:
        # TODO: this cell was left unfinished; the per-item processing goes here
        pass

# Transform