In [1]:
%matplotlib inline
import glob
import json
import pandas as pd
import datetime
import re
from ftfy import fix_text
import matplotlib.pyplot as plt

In [2]:
df = pd.DataFrame(columns = ['shop_name', 'user_id', 'listing_active_count', 'digital_listing_count',
                             'accepts_custom_requests', 'num_favorers',
                             'creation_date', 'currency_code', 'languages', 'title', 'announcement',
                             'sale_message', 'digital_sale_message', 'about'])
df

Unnamed: 0,shop_name,user_id,listing_active_count,digital_listing_count,accepts_custom_requests,num_favorers,creation_date,currency_code,languages,title,announcement,sale_message,digital_sale_message,about


In [3]:
def fix_text2(text):
    if type(text) is str:
        text = text.replace('\n',' ')
        text = text.replace('\r',' ')
        text = re.sub(' +',' ',text)
        return fix_text(text.rstrip())
    return None

In [4]:
def get_row(file):
    with open(file, 'r') as f:
        d = json.load(f)
        shop_name = d['shop_name']
        user_id = str(d['user_id'])
        listing_active_count = d['listing_active_count']
        digital_listing_count = d['digital_listing_count'] 
        accepts_custom_requests = d['accepts_custom_requests']
        num_favorers = d['num_favorers']
        creation_date = datetime.datetime.fromtimestamp(int(d['creation_tsz'])).strftime('%Y-%m-%d')
        currency_code = d['currency_code']
        languages = d['languages']
        title = fix_text2(d['title'])
        announcement = fix_text2(d['announcement'])
        sale_message = fix_text2(d['sale_message'])
        digital_sale_message = fix_text2(d['digital_sale_message'])
        
        if d['About'] :
            about = d['About']['related_links']
        else:
            about = None
        
    return [shop_name, user_id, listing_active_count, digital_listing_count, accepts_custom_requests, num_favorers, 
            creation_date, currency_code, languages, title, announcement, sale_message, digital_sale_message, about]

In [5]:
file_list = glob.glob("../_outputs/shops/*.json")
for file in file_list:
    df.loc[len(df)] = get_row(file) 

In [6]:
df.head(10)

Unnamed: 0,shop_name,user_id,listing_active_count,digital_listing_count,accepts_custom_requests,num_favorers,creation_date,currency_code,languages,title,announcement,sale_message,digital_sale_message,about
0,BijouxElte,18842258,12,0,False,195,2012-01-02,EUR,"[fr, en-US, it]",Bijoux Elté: one and only one.,Creazioni dedicate alla donna che ama indossar...,Thank you for your interest and order. Please ...,,"{'link-1': {'title': 'shop-website', 'url': 'h..."
1,0chapo,44974775,26,0,False,4,2014-02-27,EUR,[fr],,,,,
2,1000ola,88485431,159,0,True,76,2016-03-23,EUR,"[fr, de]",Bracelets et Colliers à la mode Créations excl...,1000ola sur YouTube: https://www.youtube.com/c...,,,
3,1000perlescreation,116718282,104,0,False,0,2017-08-17,EUR,[fr],,"Bienvenue dans ma boutique, je suis créatrice ...",Merci d'avoir choisi l'une de mes créations :-),,
4,1001BijouxOrigami,51366124,44,0,True,10,2017-06-12,EUR,"[fr, es, de, en-US, it, pt, ja, nl]",Bijoux origami et accessoires de coiffure,Bienvenue dans ma boutique Mille et un bijoux ...,,,"[{'title': 'facebook', 'url': 'http://www.face..."
5,1001coquesliberty,119186737,1290,0,True,39,2017-07-21,EUR,[fr],1001 coques iPhone et Samsung en Liberty,Bonne visite,"Merci de votre commande, vous avez bien précis...",,
6,1001Logos,92689432,39,23,True,30,2016-08-30,EUR,"[fr, en-US]","Logos, illustration and digital creation",PROMOTION FESTIVAL OF THE END OF YEAR! From 23...,Thank you for your order. I come to you very s...,Thank you for your order. Do not hesitate to c...,"[{'title': 'shop-website', 'url': 'http://www...."
7,1001perlesCreations,113269402,663,0,False,27,2017-07-21,EUR,"[fr, de, en-US, es, it, nl]",Que l'aventure créative continue........,"Livraison nationale gratuite dès 30 € d'achat,...",,,
8,1001tissusdePatoune,113072488,61,0,False,27,2017-07-14,EUR,[fr],,,,,
9,100Drinette,48245022,23,0,False,3,2014-11-20,EUR,[fr],Guirlandes,,,,


In [7]:
# ExcelWriter pour ne pas convertir les liens
# https://stackoverflow.com/questions/35440528/how-to-save-in-xlsx-long-url-in-cell-using-pandas
# writer = pd.ExcelWriter(r'../_outputs/shop_details.xlsx', engine='xlsxwriter', options={'strings_to_urls': False})
# df.to_excel(writer)
# writer.close()

In [8]:
df = pd.read_excel('../_outputs/shop_details.xlsx')

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14418 entries, 0 to 14417
Data columns (total 14 columns):
shop_name                  14418 non-null object
user_id                    14418 non-null int64
listing_active_count       14418 non-null int64
digital_listing_count      14418 non-null int64
accepts_custom_requests    14418 non-null bool
num_favorers               14418 non-null int64
creation_date              14418 non-null object
currency_code              14418 non-null object
languages                  14418 non-null object
title                      11827 non-null object
announcement               10190 non-null object
sale_message               5908 non-null object
digital_sale_message       588 non-null object
about                      6932 non-null object
dtypes: bool(1), int64(4), object(9)
memory usage: 1.6+ MB


In [11]:
df.describe()

Unnamed: 0,user_id,listing_active_count,digital_listing_count,num_favorers
count,14418.0,14418.0,14418.0,14418.0
mean,76402540.0,78.826536,1.159453,187.05937
std,35263240.0,194.759846,18.398279,1206.38982
min,13730.0,0.0,0.0,0.0
25%,47318740.0,11.0,0.0,3.0
50%,82044080.0,30.0,0.0,15.0
75%,110615800.0,78.75,0.0,70.0
max,123365400.0,7883.0,1016.0,64509.0


In [15]:
# df.listing_active_count.sort_values(['sort_values'], ascending=False).plot()