# Data Base

By: Javier Martínez

In [1]:
import json
import pandas as pd
from datetime import datetime

### Reading Data

In [2]:
# You can safely assume that `build_dataset` is correctly implemented
def build_dataset(path, test_size=10000):
    """
    Load a JSON-lines file and split it into train and test partitions.

    Each line of the file is parsed with ``json.loads`` into a record.
    Every record is tagged in place with a ``training_data`` key: 1 for
    training rows and 0 for test rows. The last ``test_size`` records form
    the test partition and everything before them is the training partition.
    When the file holds fewer than ``test_size`` records, all of them end up
    in the test partition.

    Parameters
    ----------
    path : str
        Location of the JSON-lines file (one JSON document per line).
    test_size : int, default 10000
        Number of trailing records reserved for the test partition.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``. The ``X_*`` items are lists
        of records; the ``y_*`` items hold each record's ``"condition"``
        value (``None`` when the key is absent). The ``"condition"`` key is
        left in the returned records.
    """
    # Context manager so the file handle is closed as soon as reading ends
    # (the bare open() call previously left it to the garbage collector).
    with open(path) as handle:
        data = [json.loads(line) for line in handle]

    #================================
    def label_training(record, label=1):
        """
        Insert the `training_data` key in the record and return it
        """
        record['training_data'] = label
        return record
    #================================

    # Explicit split index: equivalent to slicing with -test_size, but also
    # well defined for test_size == 0 (where data[:-0] would be empty).
    split = max(len(data) - test_size, 0)
    X_train = [label_training(record, label=1) for record in data[:split]]
    X_test = [label_training(record, label=0) for record in data[split:]]
    y_train = [record.get("condition") for record in X_train]
    y_test = [record.get("condition") for record in X_test]
    return X_train, y_train, X_test, y_test

In [3]:
# Load the JSON-lines file and unpack the four partitions returned by build_dataset
path = 'data/MLA_100k_checked_v3.jsonlines'
(X_train, y_train, X_test, y_test) = build_dataset(path=path)

In [4]:
# Put the train and test records back together in a single list
data = [*X_train, *X_test]

### Creating variables

In [5]:
# All keys
# Collect the distinct keys seen across every record. The set comprehension
# avoids building one list entry per (record, key) pair before de-duplicating,
# and sorting gives the same order on every run instead of relying on raw set
# iteration order, which is not stable across kernel restarts.
keys_list = sorted({key_ for doc in data for key_ in doc})

In [6]:
# Build one column per key: the value of that key in every record
# (None where a record does not have the key)
data_dict = {}
for variable in keys_list:
    data_dict[variable] = [record.get(variable) for record in data]

# Snapshot of the column names before any feature is dropped or derived
all_features = list(data_dict)

Delete features:

* international_delivery_mode: All NULL.
* listing_source: All NULL ({''}).
* coverage_areas: All NULL ([]).
* differential_pricing: All NULL.
* subtitle: All NULL ({None}).
* descriptions: equal to column ID. 
* site_id: constant (always 'MLA').

In [7]:
# features dropped
features_dropped = ['international_delivery_mode',
                    'listing_source',
                    'coverage_areas',
                    'differential_pricing',
                    'subtitle',
                    'descriptions',
                    'site_id',
                    ]
for feature in features_dropped:
    # pop() with a default keeps this cell idempotent: re-running it after the
    # keys are already gone no longer raises KeyError, as `del` would.
    data_dict.pop(feature, None)

Search the title for the word new (nuevo) or estrenar.

In [8]:
# title
# 1 when the upper-cased title contains 'NUEV', 'NEW' or 'ESTREN', else 0.
# Substring membership replaces `str.find(...) > 0`: find() returns 0 when the
# match is at the very start of the title, so those titles were flagged as 0.
data_dict['title_new'] = [
    int(any(token in title_upper for token in ('NUEV', 'NEW', 'ESTREN')))
    for title_upper in (title.upper() for title in data_dict['title'])
]

Creating boolean features

In [9]:
# shipping
# Pull three fields out of each shipping entry into their own columns.
# `1 * value` turns a bool into 0/1; a string value is left unchanged by it.
data_dict.update({
    field: [1 * shipping.get(field) for shipping in data_dict['shipping']]
    for field in ('local_pick_up', 'free_shipping', 'mode')
})

In [10]:
# variations
# 1 when the variations value differs from an empty list, 0 otherwise
data_dict['variations_boolean'] = [
    int(variations != []) for variations in data_dict['variations']
]

In [11]:
# accepts_mercadopago
# Multiplying by 1 converts a bool flag into 0/1
data_dict['accepts_mercadopago_boolean'] = [
    1 * accepts for accepts in data_dict['accepts_mercadopago']
]

In [12]:
# currency_id
# 1 when the currency is 'ARS', 0 for anything else
data_dict['currency_id_boolean'] = [
    int(currency == 'ARS') for currency in data_dict['currency_id']
]

In [13]:
# date_created
# Parse the creation timestamp string into a (naive) datetime object.
# `%f` accepts any fractional-second digits, so the timestamp is no longer
# required to end in the literal ".000Z"; for ".000" the result is unchanged.
data_dict['date_created_format'] = [
    datetime.strptime(stamp, '%Y-%m-%dT%H:%M:%S.%fZ')
    for stamp in data_dict['date_created']
]

In [14]:
# attributes
# 1 when the attributes value differs from an empty list, 0 otherwise
data_dict['attributes_boolean'] = [
    int(attributes != []) for attributes in data_dict['attributes']
]

In [15]:
# automatic_relist
# Multiplying by 1 converts a bool flag into 0/1
data_dict['automatic_relist_boolean'] = [
    1 * relist for relist in data_dict['automatic_relist']
]

In [16]:
# video_id
# 0 when the listing has no video id (None), 1 when it has one
data_dict['video_id_boolean'] = [
    0 if video is None else 1 for video in data_dict['video_id']
]

In [17]:
# sub_status
# Keep only the first entry of each list; an empty list becomes None
data_dict['sub_status_new'] = [
    None if statuses == [] else statuses[0]
    for statuses in data_dict['sub_status']
]

In [18]:
# deal_ids
# Keep only the first entry of each list; an empty list becomes None
data_dict['deal_ids_new'] = [
    None if deals == [] else deals[0]
    for deals in data_dict['deal_ids']
]

In [19]:
# seller_address
# One column per address level, holding the `id` of that level's sub-entry.
# `1 * value` leaves a string id unchanged.
data_dict.update({
    column: [1 * address.get(level).get('id') for address in data_dict['seller_address']]
    for column, level in (
        ('seller_address_country', 'country'),
        ('seller_address_state', 'state'),
        ('seller_address_city', 'city'),
    )
})

### Pandas Data Frame

In [20]:
#==============
def condition_format(x):
    """
    Encode the condition label as a number.

    Returns 1 for 'new' and 0 for 'used'; any other value is returned
    unchanged.
    """
    if x == 'new':
        return 1
    if x == 'used':
        return 0
    return x
#==============


# DataFrame
pd_data = pd.DataFrame(data_dict)
# Series.apply has no axis argument. The stray positional `1` that used to be
# passed here landed in the `convert_dtype` slot (deprecated since pandas 2.1),
# where a truthy value equals the default, so dropping it keeps the result.
pd_data['condition_new'] = pd_data['condition'].apply(condition_format)


# save data
pd_data.to_pickle('./data/data_base.pkl')