In [1]:
%matplotlib inline

import json 

import pandas as pd

In [2]:
import matplotlib as mpl 

import matplotlib.pyplot as plt 

In [3]:
from ggplot import theme_bw

# Build ggplot's black-and-white theme and copy its rcParams into matplotlib's
# global rcParams, so figures created later in the notebook use that styling.
theme = theme_bw()

mpl.rcParams.update(theme.get_rcParams())

You can access Timestamp as pandas.Timestamp
  pd.tslib.Timestamp,
  from pandas.lib import Timestamp
  from pandas.core import datetools


In [4]:
import sys
sys.path.append('../mobility_data_retrieval/')

from taxonomy import Taxonomy, get_taxonomy_from_file, get_column_from_file, get_tag_from_file

# Taxonomy

The first step of the script is creating a JSON to keep the taxonomy centralized and up-to-date.

In [5]:
# Parse the taxonomy definition (taxonomy.json, relative to the working
# directory) into `data`.
with open('taxonomy.json') as taxonomy_file:
    data = json.load(taxonomy_file)

In [6]:
# Wrap the parsed taxonomy JSON in the project's Taxonomy helper.
tax = Taxonomy(data)

In [7]:
# Parse the regions definition (regions.json, relative to the working
# directory). NOTE: this rebinds `data`, replacing the taxonomy JSON.
with open('regions.json') as regions_file:
    data = json.load(regions_file)

In [8]:
# Second Taxonomy instance, built from the regions JSON loaded just before.
# NOTE(review): '05-reg_' is presumably a tag prefix — confirm against Taxonomy.
regions = Taxonomy(data, '05-reg_')

# Data

We are working with two datasets, S1 and S2. S1 contains information from the Czech Statistical Office (CSO); S2 contains information from the mobile networks.

In [9]:
# Folder holding the taxonomy CSV exports. Kept in a single constant so the
# notebook can be pointed at another location by editing one line.
# NOTE(review): this is still an absolute, machine-specific path.
TAXONOMY_DIR = '/Users/g4brielvs/Desktop/TAXONOMY_20180313'

SRC_HEADER = TAXONOMY_DIR + '/03-hed_tertiaryL1data.csv'
SRC_HEADER_NAMES = TAXONOMY_DIR + '/03-hed_nam.csv'

# Column labels read from the header files; they are passed as `names=` when
# the data and names CSVs are loaded.
HEADER = get_column_from_file(SRC_HEADER)
NAMES_HEADER = get_column_from_file(SRC_HEADER_NAMES)

## Import names

In [10]:
# Names file; its columns are labelled with NAMES_HEADER. The comma separator
# is pandas' default, so it is not spelled out.
SRC_NAMES = '/Users/g4brielvs/Desktop/TAXONOMY_20180313/02-nam_LAU2.csv'

NAMES = pd.read_csv(SRC_NAMES, names=NAMES_HEADER)

## Import data

In [11]:
# Source files for the two datasets (S1: "cso" file, S2: "o2" file).
SRC_S1 = '/Users/g4brielvs/Desktop/data_plot/1-00-nr_2-50-COM_3-01-L1_4-01-cso_5-01-ver_6-01-pop_7-03-lau2_8-99-na_9-10-res00_10-99-na_11-99-na_12-02-uni_13-99-na_14-99-na_15-99-na_16-99-na_17-99-na_18-99-na_19-99-na.csv'
SRC_S2 = '/Users/g4brielvs/Desktop/data_plot/1-33-ter_2-04-TACSU_3-01-L1_4-02-o2_5-01-ver_6-01-pop_7-03-lau2_8-03-ngt_9-06-res03_10-99-na_11-99-na_12-02-uni_13-99-na_14-99-na_15-99-na_16-99-na_17-99-na_18-99-na_19-99-na.csv'

# Both files have no header row; the shared HEADER list supplies column names.
S1, S2 = (
    pd.read_csv(source, header=None, names=HEADER)
    for source in (SRC_S1, SRC_S2)
)

## Import lists

In [22]:
# File listing the region ids that the analysis is restricted to.
SRC_LIST_REGIONS = '/Users/g4brielvs/Desktop/TAXONOMY_20180313/1-20-lst_2-06-PhaJZ_3-03-la2.csv'

# Read from SRC_LIST_REGIONS, the path defined above. The previous name,
# SRC_LIST, is not defined anywhere in the notebook and raised NameError on a
# fresh kernel.
LIST_REGIONS = get_column_from_file(SRC_LIST_REGIONS)

## Working on the data

### Merging datasets S1 and S2

In [13]:
# Keep only the rows whose 'rO1ID' appears in LIST_REGIONS. `isin` performs the
# membership test in one vectorised pass instead of one Python call per row.
df1 = S1[S1['rO1ID'].isin(LIST_REGIONS)]
df2 = S2[S2['rO1ID'].isin(LIST_REGIONS)]

# Row counts after filtering, shown as the cell output.
len(df1), len(df2)

(68, 900)

In [15]:
# Outer join of the two filtered frames on 'rO1ID'. Columns present in both
# inputs get pandas' default suffixes: `_x` for df1 (S1), `_y` for df2 (S2).
df = df1.merge(df2, how='outer', on=['rO1ID'])

df.describe()

Unnamed: 0,rO1ID,data1_x,casUsID_y,data1_y
count,908.0,908.0,900.0,900.0
mean,537614.870044,1579.912996,3.0,383.674444
std,13627.848082,2907.283135,0.0,772.273566
min,531057.0,88.0,3.0,8.0
25%,531740.0,300.5,3.0,79.0
50%,533203.0,607.0,3.0,169.0
75%,539198.0,1110.0,3.0,328.0
max,599727.0,18819.0,3.0,6819.0


### Filtering out irrelevant information

Keeping only the information for calculating the averages 

In [18]:
# Columns needed for the averages: region id, the S2 date column and the two
# value columns (S1 -> data1_x, S2 -> data1_y).
columns_to_keep = ['rO1ID', 'denID_y', 'data1_x', 'data1_y']

# Take an explicit copy: columns are added to this frame in later cells, and
# writing into a plain column slice triggers pandas' SettingWithCopyWarning.
df = df[columns_to_keep].copy()

### Creating weekday

At first, the criterion for creating the groups is the weekday. In the future, this selection will be made more flexible to include other types of filtering.

In [21]:
# Parse the S2 date column into datetimes.
df['denID_y'] = pd.to_datetime(df['denID_y'])

# Day of week (Monday = 0 ... Sunday = 6) via the vectorised `.dt` accessor
# rather than a per-row Python lambda; missing dates stay missing.
df['weekday'] = df['denID_y'].dt.weekday

### Grouping by criteria

In [17]:
# Labels of the weekday-based groups: 'D2', 'D3', 'D4'.
# The previous cell content did not parse (dangling `for l in in`, a
# comprehension without a loop variable, an unbalanced '}' in the format
# string) and referenced an undefined LIST.
# NOTE(review): the ids (2, 3, 4) mirror the D2/D3/D4 frames used in this
# notebook — confirm this is the intended list.
groups = ['D{}'.format(i) for i in (2, 3, 4)]

In [18]:
# Weekday-based subsets of the merged frame (weekday: 0 = Monday ... 6 = Sunday).
#   D2: every row with a known weekday (0-6)
#   D3: weekdays 1, 2, 3
#   D4: weekdays 5, 6
D2 = df.loc[df['weekday'].isin(range(7))]
D3 = df.loc[df['weekday'].isin([1, 2, 3])]
D4 = df.loc[df['weekday'].isin([5, 6])]

In [19]:
# Per-region aggregation spec: first value of the S1 column (data1_x) and the
# mean of the S2 column (data1_y).
f = {'data1_x': ['first'], 'data1_y': ['mean']}


def _aggregate_by_region(frame):
    """Aggregate `frame` per 'rO1ID' with the spec in `f`.

    Returns the aggregated frame indexed by 'rO1ID'. The index is reset and
    set again in a single chain (no ``inplace=True``), so re-running the cell
    always yields the same result.
    """
    return frame.groupby(['rO1ID']).agg(f).reset_index().set_index('rO1ID')


# One aggregate per weekday subset.
G2 = _aggregate_by_region(D2)
G3 = _aggregate_by_region(D3)
G4 = _aggregate_by_region(D4)

In [20]:
# Start from an empty frame that shares G2's index.
# NOTE(review): this rebinds `df`; the merged frame from the earlier cells is
# no longer reachable under that name after this cell runs.
df = pd.DataFrame(data=None, index=G2.index)

# G1 is the 'data1_x' aggregate taken from G2; G2/G3/G4 are the 'data1_y'
# aggregates of the corresponding grouped frames. Assignment aligns on the
# index, so ids missing from G3 or G4 end up as NaN in those columns.
df['G1'] = G2['data1_x']
df['G2'] = G2['data1_y']
df['G3'] = G3['data1_y']
df['G4'] = G4['data1_y']

In [39]:
# Ratio of each group's value to G1, then drop every row that contains a
# missing value in any column.
df = df.assign(
    avg2=df['G2'] / df['G1'],
    avg3=df['G3'] / df['G1'],
    avg4=df['G4'] / df['G1'],
).dropna()

df

Unnamed: 0_level_0,G1,G2,G3,G4,avg2,avg3,avg4
rO1ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
531057,18819,5297.2,6291.285714,4171.75,0.281481,0.334305,0.221678
531103,401,106.333333,121.714286,91.5,0.26517,0.303527,0.22818
531138,1079,225.733333,275.571429,182.75,0.209206,0.255395,0.16937
531154,712,138.266667,167.714286,111.5,0.194195,0.235554,0.156601
531171,902,239.133333,300.285714,187.5,0.265115,0.332911,0.207871
531189,6951,1266.933333,1584.714286,953.25,0.182266,0.227984,0.137139
531243,1582,295.533333,360.142857,236.5,0.18681,0.22765,0.149494
531316,799,220.2,261.142857,188.25,0.275594,0.326837,0.235607
531332,212,65.733333,78.428571,54.75,0.310063,0.369946,0.258255
531472,1136,333.533333,415.0,262.25,0.293603,0.365317,0.230854


### Plotting

#### Title

In [22]:
# Look up a tag for the S1 source file and resolve it to the text that is used
# as the figure super-title in the plotting cell.
# NOTE(review): the meaning of the positional 8 and of name='CZnosp' is defined
# by the taxonomy helpers — confirm there.
tag = get_tag_from_file(SRC_S1, 8)
title = tax.get_text_from_tag(tag, name='CZnosp')

title

'BydliciSLDB'

In [33]:
# One figure per region: box plots of the three ratio columns over all regions,
# with the current region's own values overlaid as red diamonds.
for index, row in df.iterrows():

    fig, ax = plt.subplots()

    # Same fixed y range on every figure.
    ax.set_ylim(0.0, 1.0)

    # Super-title is shared; the axes title is the region's name.
    fig.suptitle(title, y=1.05, fontsize=17)

    name = regions.get_text_from_tag(tag=str(index), name='inWritingCZ')
    ax.set_title(name)

    # Draw on the axes created above instead of relying on pyplot's implicit
    # "current axes". return_type='both' yields (axes, dict of artists).
    _, bp = df.boxplot(column=['avg2', 'avg3', 'avg4'], ax=ax,
                       return_type='both', patch_artist=True, zorder=1)

    # Annotate each box with its own median. Previously every label read
    # bp['medians'][0], so all three boxes showed the first box's median.
    for tick, median in zip([1, 2, 3], bp['medians']):
        ax.text(tick, 0.75, median.get_ydata()[0],
                horizontalalignment='center', size='x-small')

    # avg3 is G3 / G1, so the middle label is 'D3 / D1' (it read 'D3 / D4').
    ax.set_xticklabels(['D2 / D1', 'D3 / D1', 'D4 / D1'], rotation='vertical')

    # Current region's ratios at the matching box positions.
    ax.plot(1, row['avg2'], 'rD', markersize=5)
    ax.plot(2, row['avg3'], 'rD', markersize=5)
    ax.plot(3, row['avg4'], 'rD', markersize=5)

    fig.savefig('data/foo_{}.eps'.format(index), bbox_inches='tight', format='eps', dpi=1000)

    # Release the figure so memory does not grow with the number of regions.
    plt.close(fig)

In [None]:
# NOTE(review): `create_plots` is neither defined nor imported in the cells
# visible here, and this cell has no execution count — presumably a planned
# wrapper around the plotting steps; confirm it exists before running.
create_plots(S1, S2, groups=list(), index=8, name='CZnosp')