# Data analysis

By: Javier Martínez

In [32]:
import json
import pandas as pd
import plotly.express as px



### Reading Data

In [2]:
# You can safely assume that `build_dataset` is correctly implemented
def build_dataset(path):
    data = [json.loads(x) for x in open(path)]
    target = lambda x: x.get("condition")
    N = -10000
    X_train = data[:N]
    X_test = data[N:]
    y_train = [target(x) for x in X_train]
    y_test = [target(x) for x in X_test]
    for x in X_test:
        del x["condition"]
    return X_train, y_train, X_test, y_test

In [3]:
# Reading Data
path = 'data/MLA_100k_checked_v3.jsonlines'
X_train, y_train, X_test, y_test = build_dataset(path)

In [4]:
# All keys
keys_list = []
for doc in X_train:
    for key_ in doc.keys():
        keys_list.append(key_)
    
keys_list = list(set(keys_list))

In [5]:
# Get values
data = {}
for variable in keys_list :

    data[variable] = list(map( lambda x: x.get(variable), X_train ))

all_features  = list(data.keys())

In [6]:
# features dropped
features_dropped = ['international_delivery_mode',
                    'listing_source',
                    'coverage_areas',
                    'differential_pricing',
                    'subtitle']
for feature in features_dropped:
    del data[feature]

In [7]:
pd_data = pd.DataFrame(data)
pd_data.head()

Unnamed: 0,id,last_updated,start_time,non_mercado_pago_payment_methods,seller_id,accepts_mercadopago,condition,title,automatic_relist,base_price,...,descriptions,seller_address,stop_time,status,price,original_price,site_id,available_quantity,sub_status,variations
0,MLA4695330653,2015-09-05T20:42:58.000Z,1441485773000,"[{'description': 'Transferencia bancaria', 'id...",8208882349,True,new,Auriculares Samsung Originales Manos Libres Ca...,False,80.0,...,[{'id': 'MLA4695330653-912855983'}],"{'country': {'name': 'Argentina', 'id': 'AR'},...",1446669773000,active,80.0,,MLA,1,[],[]
1,MLA7160447179,2015-09-26T18:08:34.000Z,1443290910000,"[{'description': 'Transferencia bancaria', 'id...",8141699488,True,used,Cuchillo Daga Acero Carbón Casco Yelmo Solinge...,False,2650.0,...,[{'id': 'MLA7160447179-930764806'}],"{'country': {'name': 'Argentina', 'id': 'AR'},...",1448474910000,active,2650.0,,MLA,1,[],[]
2,MLA7367189936,2015-09-09T23:57:10.000Z,1441843027000,"[{'description': 'Transferencia bancaria', 'id...",8386096505,True,used,"Antigua Revista Billiken, N° 1826, Año 1954",False,60.0,...,[{'id': 'MLA7367189936-916478256'}],"{'country': {'name': 'Argentina', 'id': 'AR'},...",1447027027000,active,60.0,,MLA,1,[],[]
3,MLA9191625553,2015-10-05T16:03:50.306Z,1443466076000,"[{'description': 'Transferencia bancaria', 'id...",5377752182,True,new,Alarma Guardtex Gx412 Seguridad Para El Automo...,False,580.0,...,[{'id': 'MLA9191625553-932309698'}],"{'country': {'name': 'Argentina', 'id': 'AR'},...",1449191596000,active,580.0,,MLA,1,[],[]
4,MLA7787961817,2015-08-28T13:37:41.000Z,1440454040000,"[{'description': 'Transferencia bancaria', 'id...",2938071313,True,used,Serenata - Jennifer Blake,False,30.0,...,[{'id': 'MLA7787961817-902981678'}],"{'country': {'name': 'Argentina', 'id': 'AR'},...",1445638040000,active,30.0,,MLA,1,[],[]


In [29]:
pd_data.shape[0]

90000

In [31]:
conditio_summary = pd_data\
                        .groupby('condition',as_index=False)\
                        .agg({'id':'count',
                            })

conditio_summary['total'] = conditio_summary.apply(lambda x: 100*x.id/pd_data.shape[0],1)
conditio_summary

Unnamed: 0,condition,id,total
0,new,48352,53.724444
1,used,41648,46.275556


In [34]:

fig = px.bar(conditio_summary, x="condition", y="total", title="Condition")
fig.show()

In [17]:
pd_data.query("id==None").head()

Unnamed: 0,id,last_updated,start_time,non_mercado_pago_payment_methods,seller_id,accepts_mercadopago,condition,title,automatic_relist,base_price,...,descriptions,seller_address,stop_time,status,price,original_price,site_id,available_quantity,sub_status,variations


In [13]:
list(pd_data)

['id',
 'last_updated',
 'start_time',
 'non_mercado_pago_payment_methods',
 'seller_id',
 'accepts_mercadopago',
 'condition',
 'title',
 'automatic_relist',
 'base_price',
 'shipping',
 'tags',
 'pictures',
 'parent_item_id',
 'official_store_id',
 'deal_ids',
 'listing_type_id',
 'attributes',
 'catalog_product_id',
 'permalink',
 'buying_mode',
 'category_id',
 'initial_quantity',
 'sold_quantity',
 'thumbnail',
 'video_id',
 'currency_id',
 'secure_thumbnail',
 'warranty',
 'date_created',
 'descriptions',
 'seller_address',
 'stop_time',
 'status',
 'price',
 'original_price',
 'site_id',
 'available_quantity',
 'sub_status',
 'variations']

In [8]:
# variable = all_features[9]
# print(variable)
# data[variable]

In [9]:
# set(data[variable])

In [10]:
# list(filter(lambda x: x!=[], data[variable]))

In [11]:
# all_names = []
# for doc in data[variable]:
#     print(doc)

In [12]:
# all_names = []
# for doc in data[variable]:
#     for name in doc[0].keys():
#         all_names.append(name)
    
# all_names = list(set(all_names))
# all_names