In [1]:
%matplotlib inline
import glob
import json
import pandas as pd
import datetime
import re
from ftfy import fix_text
import feather

In [2]:
df = pd.DataFrame(columns = ['user_id', 'login_name', 'gender', 'join_date', 'bio', 'country_id', 'region', 'city', 
                             'is_seller', 'transaction_buy_count', 'transaction_sold_count',
                            'first_name', 'last_name'])
df

Unnamed: 0,user_id,login_name,gender,join_date,bio,country_id,region,city,is_seller,transaction_buy_count,transaction_sold_count,first_name,last_name


In [3]:
def fix_text2(text):
    if type(text) is str:
        text = text.replace('\n',' ')
        text = text.replace('\r',' ')
        text = re.sub(' +',' ',text)
        return fix_text(text.rstrip())
    return None

In [4]:
def get_row(file):
    with open(file, 'r') as f:
        d = json.load(f)
        user_id = d['user_id']
        login_name = d['login_name']
        gender = d['gender']
        join_date = datetime.datetime.fromtimestamp(int(d['join_tsz'])).strftime('%Y-%m-%d')
        bio = fix_text2(d['bio'])
        country_id = d['country_id']
        region = fix_text2(d['region'])
        city = fix_text2(d['city'])
        is_seller = d['is_seller']
        transaction_buy_count = d['transaction_buy_count']
        transaction_sold_count = d['transaction_sold_count']
        first_name = fix_text2(d['first_name'])
        last_name = fix_text2(d['last_name'])
    
    return [user_id, login_name, gender, join_date, bio, country_id, region, city, 
            is_seller, transaction_buy_count, transaction_sold_count, first_name, last_name]

In [5]:
file_list = glob.glob("../_outputs/owners/*.json")
for file in file_list:
    df.loc[len(df)] = get_row(file)

In [6]:
df.head(10)

Unnamed: 0,user_id,login_name,gender,join_date,bio,country_id,region,city,is_seller,transaction_buy_count,transaction_sold_count,first_name,last_name
0,108706714,atelierkemet,male,2017-05-10,www.atelierkemet.com,103.0,,Lens-Lestang,True,0,0,Anthony,Bres
1,19496469,bahiadelsol,private,2012-02-02,"Entre passion et artisanat, mon atelier, nich√©...",103.0,Rhone-alpes,Lyon,True,68,300,Bahia,Del Sol
2,62358769,aliciavente,female,2015-02-26,,103.0,,Bo√´n-sur-Lignon,True,2,1,Alicia,VENTE
3,94673146,GrandSiecleBijoux,female,2016-10-05,,,,,True,0,6,Grand,Si√®cle
4,101002670,angliquebourchenin,female,2016-12-31,,,,,True,0,1,Etats,D'√¢me
5,59548002,ClairesPlumes,female,2015-01-02,Chaque cr√©ation de Claire's Plumes est une pi√®...,103.0,,B√©thisy-Saint-Pierre,True,28,0,Claire,Russo
6,28941673,Chikhuahua,female,2013-01-02,,103.0,,Algrange,True,1,6,Fanny,Bada
7,64734814,elicsenbonnesseur,female,2015-04-14,,103.0,,Chartres,True,1,16,Elicsen,Patuel
8,112769329,AneleDesign,private,2017-05-08,Bienvenue dans ma boutique ! Cr√©ations de bijo...,103.0,,Villepreux,True,0,202,Anele,Design
9,11318854,Tsunade23,private,2010-10-12,,,,,True,2,0,,


In [7]:
df.describe()

Unnamed: 0,user_id,login_name,gender,join_date,bio,country_id,region,city,is_seller,transaction_buy_count,transaction_sold_count,first_name,last_name
count,14415,14415,14415,14415,14415.0,10544,14415.0,14415.0,14415,14415,14415,12984.0,8343.0
unique,14415,14415,82,2733,7386.0,24,56.0,3859.0,2,431,850,6081.0,6983.0
top,117297897,vtoc,female,2017-07-03,,103,,,True,0,0,,
freq,1,1,9699,500,7023.0,10489,13784.0,3190.0,14414,7860,4266,228.0,228.0


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14415 entries, 0 to 14414
Data columns (total 13 columns):
user_id                   14415 non-null object
login_name                14415 non-null object
gender                    14415 non-null object
join_date                 14415 non-null object
bio                       14415 non-null object
country_id                10544 non-null object
region                    14415 non-null object
city                      14415 non-null object
is_seller                 14415 non-null object
transaction_buy_count     14415 non-null object
transaction_sold_count    14415 non-null object
first_name                12984 non-null object
last_name                 8343 non-null object
dtypes: object(13)
memory usage: 1.5+ MB


In [None]:
# feather.write_dataframe(df, 'user_profiles.feather')

In [9]:
df = pd.read_excel('../_outputs/user_profiles.xlsx')

In [10]:
female = df[df['gender'] == 'female']['user_id'].count()
female

9699

In [11]:
male = df[df['gender'] == 'male']['user_id'].count()
male

1234

In [12]:
ratio = female /(female + male)
ratio

0.88713070520442694

In [13]:
df['transaction_buy_count'] = df['transaction_buy_count'].astype(int)
df['transaction_sold_count'] = df['transaction_sold_count'].astype(int)

In [14]:
df[df['gender'].isin(['female', 'male'])].groupby('gender')['transaction_buy_count', 'transaction_sold_count'].mean()

Unnamed: 0_level_0,transaction_buy_count,transaction_sold_count
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
female,26.517373,98.414579
male,4.924635,105.902755


In [15]:
df.gender.unique()

array(['female', 'private', 'male', nan, 'family', 'Artiste', 'couple',
       'furry', 'artiste', 'Marque', 'Couple', 'witch',
       'Magicienne de li√®ge', 'Studio', 'Magasin', 'Collectif cr√©atif',
       'groupe', 'mari√©', 'Both', 'Entreprise', 'Artisan', 'Famille',
       'PRO', 'both', 'Boutique', 'Cr√©ateur', 'L√©zard', 'Lak&#39;s Artes',
       'cr√©atrice', 'galerie d&#39;art', 'genderfluid', 'Team BUBU',
       'SylHulaGirl', 'Alien', 'Viro', 'guru', 'Plasticien',
       'ourse-papillon', 'Etre de lumi√®re', 'Petite entreprise',
       'GUILLOU Corinne', 'Agender', 'Poulpe',
       'Petite cr√©atrice de carnets et d&#39;accessoires', 'Cr√©atrice',
       'h/f', 'Princesse', 'socks', '3 femmes', 'Unisexe', 'Bin√¥me',
       'Synergie', 'Marque de mobilier design scandinave', '√Çme', 'Soeurs',
       'Deux soeurs qui s&#39;adorent !', 'femme chat', 'man en vrouw',
       'chat', 'Atelier de design', '1 homme et 1 femme', 'PROFESSIONNEL',
       'Soci√©t√©', 'F&H', 'Renard', '

In [16]:
# df.sort_values('count',ascending=False).groupby('city')['user_id'].count()
df[['city']].groupby(['city'])['city'] \
                             .count() \
                             .reset_index(name='count') \
                             .sort_values(['count'], ascending=False) \
                             .head(20)

Unnamed: 0,city,count
2460,Paris,1605
1929,Lyon,377
3462,Toulouse,305
422,Bordeaux,249
2311,Nantes,219
1839,Lille,174
2024,Marseille,155
2218,Montpellier,151
2759,Rennes,125
2329,Nice,123
