In [101]:
%matplotlib inline

import json 

import pandas as pd

In [102]:
import matplotlib as mpl 

import matplotlib.pyplot as plt 
# Draw marker outlines with a thin 0.5pt edge in every figure below.
mpl.rcParams['lines.markeredgewidth'] = 0.5

In [103]:
# Apply matplotlib's 'grayscale' style sheet to every figure created afterwards.
plt.style.use('grayscale')

# Debug helper: uncomment to list the style sheets this matplotlib install offers.
#print(plt.style.available)

In [111]:
import sys
sys.path.append('../src/')

from taxonomy import Taxonomy, get_taxonomy_from_file, get_column_from_file, get_tag_from_file

# Taxonomy

The first step of the script is creating a JSON to keep the taxonomy centralized and up-to-date.

In [105]:
# Load the centralized taxonomy definition from JSON and wrap it in the
# project's Taxonomy helper (used later to resolve the plot title).
# The encoding is pinned to UTF-8 because the taxonomy carries non-ASCII
# (Czech) text and the platform default encoding is not UTF-8 everywhere.
with open('taxonomy.json', encoding='utf-8') as f:
    data = json.load(f)

tax = Taxonomy(data)

In [106]:
# Load the region definitions from JSON and wrap them in a Taxonomy built
# with the '05-reg_' second argument (used later to title each region plot).
# The encoding is pinned to UTF-8 because region names carry non-ASCII
# (Czech) text and the platform default encoding is not UTF-8 everywhere.
with open('regions.json', encoding='utf-8') as f:
    data = json.load(f)

regions = Taxonomy(data, '05-reg_')

In [108]:
# Inputs for get_taxonomy_from_file() in the next cell: a file-name prefix
# plus two columns read from the taxonomy export directory.
prefix = '02-nam_'

# `tags` is the column stored in 03-str_lst.csv.
SRC = '/Users/g4brielvs/Desktop/TAXONOMY_20180320/03-str_lst.csv'
tags = get_column_from_file(SRC)

# `names` is the column stored in 04-hed_nam.csv.
SRC = '/Users/g4brielvs/Desktop/TAXONOMY_20180320/04-hed_nam.csv'
names = get_column_from_file(SRC)

In [109]:
# Build the raw taxonomy data from the export directory, using the tags,
# prefix and names prepared in the previous cell and pivoting on 'txID'.
SRC = '/Users/g4brielvs/Desktop/TAXONOMY_20180320/'

data = get_taxonomy_from_file(
    SRC,
    tags=tags,
    prefix=prefix,
    names=names,
    pivot='txID',
)

In [110]:
# Wrap the data built by get_taxonomy_from_file() in a Taxonomy, passing
# '02-nam_' as the second argument (presumably the tag prefix; confirm
# against the Taxonomy class).
# NOTE(review): this rebinds `names`, which until now held the column read
# from 04-hed_nam.csv; re-running earlier cells afterwards changes its type.
names = Taxonomy(data, '02-nam_')

# Data

We are working with two datasets, S1 and S2. S1 contains information from the Czech Statistical Office (CSO); S2 contains information from the mobile networks.

In [8]:
# Column names applied to the S1/S2 data files when they are read.
SRC_HEADER = '/Users/g4brielvs/Desktop/TAXONOMY_20180313/03-hed_tertiaryL1data.csv'
HEADER = get_column_from_file(SRC_HEADER)

# Column names applied to the names file when it is read.
SRC_HEADER_NAMES = '/Users/g4brielvs/Desktop/TAXONOMY_20180313/03-hed_nam.csv'
NAMES_HEADER = get_column_from_file(SRC_HEADER_NAMES)

## Import names

In [9]:
# Read the header-less, comma-separated names file, labelling its columns
# with NAMES_HEADER (comma is already read_csv's default separator).
SRC_NAMES = '/Users/g4brielvs/Desktop/TAXONOMY_20180313/02-nam_LAU2.csv'

NAMES = pd.read_csv(SRC_NAMES, names=NAMES_HEADER)

## Import data

In [10]:
# S1 is the Czech Statistical Office dataset, S2 the mobile-network dataset.
SRC_S1 = '/Users/g4brielvs/Desktop/data_plot/1-00-nr_2-50-COM_3-01-L1_4-01-cso_5-01-ver_6-01-pop_7-03-lau2_8-99-na_9-10-res00_10-99-na_11-99-na_12-02-uni_13-99-na_14-99-na_15-99-na_16-99-na_17-99-na_18-99-na_19-99-na.csv'
SRC_S2 = '/Users/g4brielvs/Desktop/data_plot/1-33-ter_2-04-TACSU_3-01-L1_4-02-o2_5-01-ver_6-01-pop_7-03-lau2_8-03-ngt_9-06-res03_10-99-na_11-99-na_12-02-uni_13-99-na_14-99-na_15-99-na_16-99-na_17-99-na_18-99-na_19-99-na.csv'

# Both files are header-less and share the same column layout (HEADER).
S1, S2 = (
    pd.read_csv(path, header=None, names=HEADER)
    for path in (SRC_S1, SRC_S2)
)

## Import lists

In [11]:
# List file whose column is used below to keep only the matching 'rO1ID'
# rows of S1 and S2.
SRC_REGIONS = '/Users/g4brielvs/Desktop/TAXONOMY_20180313/1-20-lst_2-06-PhaJZ_3-03-la2.csv'

# NOTE(review): the container type returned by get_column_from_file is not
# visible here; the filtering cell tests membership against it.
REGIONS = get_column_from_file(SRC_REGIONS)

In [100]:
# One list file per day grouping; each holds the criteria for that group.
SRC_LIST_GROUPS = [
    '/Users/g4brielvs/Desktop/TAXONOMY_20180319/1-20-lst_2-30-weekDay7_3-98-nr_4-01-man_5-20180317105000.csv',
    '/Users/g4brielvs/Desktop/TAXONOMY_20180319/1-20-lst_2-31-workDay3_3-98-nr_4-01-man_5-20180317105000.csv',
    '/Users/g4brielvs/Desktop/TAXONOMY_20180319/1-20-lst_2-32-freeDay2_3-98-nr_4-01-man_5-20180317105000.csv',
]

# Map group labels to their criteria. Labels start at 'D2' because 'D1' is
# reserved for the baseline column created later in the notebook.
GROUPS = {
    'D{}'.format(position): get_column_from_file(path)
    for position, path in enumerate(SRC_LIST_GROUPS, 2)
}

FileNotFoundError: File b'/Users/g4brielvs/Desktop/TAXONOMY_20180319/1-20-lst_2-30-weekDay7_3-98-nr_4-01-man_20180317105000.csv' does not exist

## Working on the data

### Merging datasets S1 and S2

In [13]:
# Keep only the rows whose 'rO1ID' appears in REGIONS.
# Vectorized .isin replaces a row-wise apply(axis=1): it is much faster,
# matches the filtering style used for the weekday criteria later on, and
# still yields a boolean mask when the frame is empty.
# NOTE(review): assumes REGIONS is a list-like of region ids - confirm
# against get_column_from_file.
df1 = S1[S1['rO1ID'].isin(REGIONS)]
df2 = S2[S2['rO1ID'].isin(REGIONS)]

In [14]:
# Outer-join the two filtered datasets on the region id so rows present in
# only one of them survive, then index the result by that id. Overlapping
# columns get pandas' default '_x' (from df1) and '_y' (from df2) suffixes.
df = df1.merge(df2, on=['rO1ID'], how='outer').set_index('rO1ID')

### Filtering out irrelevant information

Keeping only the information for calculating the averages 

In [15]:
# Keep only the columns needed for the averages. The explicit copy makes
# the result an independent frame, so adding or overwriting columns in
# later cells does not trigger pandas' SettingWithCopyWarning.
df = df[['denID_y', 'data1_x', 'data1_y']].copy()

### Creating weekday

At first, the criterion for creating the groups is the day of the week. In the future, this selection will be more flexible and include other types of filtering.

In [16]:
# Parse the S2 date column, then derive the day of the week from it
# (Monday=0 ... Sunday=6). `.dt.weekday` is the vectorized equivalent of
# calling x.weekday() on every element, and `assign` returns a new frame
# instead of writing into a possible slice of the merged data.
df = df.assign(denID_y=pd.to_datetime(df['denID_y']))
df = df.assign(weekday=df['denID_y'].dt.weekday)

### Creating dataframe for keeping the averages

In [17]:
# Aggregation spec for groupby().agg(): take the first 'data1_x' value and
# the mean of 'data1_y' within each group.
f = dict(data1_x=['first'], data1_y=['mean'])

In [18]:
# Aggregate per region id with the spec in `f`, keeping 'rO1ID' as the index.
result = df.groupby(['rO1ID']).agg(f).reset_index().set_index('rO1ID')

# 'D1' is the baseline: the first 'data1_x' value of each region.
result['D1'] = result[('data1_x', 'first')]

# Drop the raw aggregates; the per-group columns are added in the next section.
result = result[['D1']]

### Grouping by criteria

In [19]:
#g = df.groupby(['rO1ID'])

In [20]:
for group, criteria in GROUPS.items():

    # Rows whose weekday matches this group's criteria, aggregated per
    # region id with the same spec (`f`) used for the baseline.
    group_means = (
        df[df['weekday'].isin(criteria)]
        .groupby(['rO1ID'])
        .agg(f)
        .reset_index()
        .set_index('rO1ID')
    )

    # Mean 'data1_y' of the group, aligned to `result` on the region id.
    result[group] = group_means[('data1_y', 'mean')]

    # Ratio of the group mean to the baseline D1.
    result['{}/D1'.format(group)] = result[group] / result['D1']

In [21]:
# Discard every region (row) that has a missing value in any column;
# dropna() defaults to axis=0, how='any'.
result = result.dropna()

### Plotting

#### Title

In [80]:
# The figure title is derived from the S1 file: take the tag at index 8
# (the 9th tag) and look up its 'CZnosp' text in the main taxonomy.
# NOTE(review): how get_tag_from_file splits the file name into tags is not
# visible here - confirm the index convention.
tag = get_tag_from_file(SRC_S1, 8)
title = tax.get_text_from_tag(tag, name='CZnosp')

In [94]:
# Tag at index 1 of every group list file.
group_tags = [get_tag_from_file(path, 1) for path in SRC_LIST_GROUPS]

# Look up the 'CZnosp' text of each tag in the `names` taxonomy.
# NOTE(review): the recorded output shows 'Not found in the taxonomy.' for
# all three tags, so either the tag index or the taxonomy used is likely wrong.
[names.get_text_from_tag(group_tag, name='CZnosp') for group_tag in group_tags]

['Not found in the taxonomy.',
 'Not found in the taxonomy.',
 'Not found in the taxonomy.']

#### Plot

In [23]:
# Names of the ratio columns ('D2/D1', 'D3/D1', ...) in GROUPS order; they
# select the boxplot columns and label the x axis.
ticks = list(map('{}/D1'.format, GROUPS))

In [56]:
# One figure per region: a boxplot of every ratio column across all regions,
# with the current region's value (red triangle) and the column mean (black
# dash plus text label) overlaid, saved as data/foo_<region id>.eps.

# The column means do not depend on the region, so compute them once.
tick_means = {tick: result[tick].mean() for tick in ticks}

for index, row in result.iterrows():

    fig, ax = plt.subplots()

    # Ratios are drawn on a fixed 0..1 scale so figures are comparable.
    ax.set_ylim(0.0, 1.0)

    # Figure-level title from the taxonomy, axes title with the region name.
    fig.suptitle(title, y=1.05, fontsize=17)
    ax.set_title(regions.get_text_from_tag(str(index), 'inWritingCZ'))

    # patch_artist=True makes the boxes fillable; zorder=-1 keeps the boxes
    # behind the markers plotted below. Drawing on `ax` explicitly avoids
    # relying on pyplot's "current axes" state.
    _, bp = result.boxplot(column=ticks, ax=ax, return_type='both',
                           patch_artist=True, zorder=-1)

    ax.set_axisbelow(True)
    ax.grid(True, color='gray', linestyle='--', linewidth=0.25)

    # Purple outline, white fill. Two calls are kept on purpose: `color`
    # sets both edge and face, then `facecolor` overrides the fill.
    for box in bp['boxes']:
        box.set(color='#7570b3', linewidth=1)
        box.set(facecolor='#ffffff')

    # Small filled black circles for the outliers.
    for flier in bp['fliers']:
        flier.set(marker='o', markerfacecolor='black', markersize=2)

    # Boxplot positions are 1-based, hence enumerate(..., 1).
    for i, tick in enumerate(ticks, 1):
        mean_value = tick_means[tick]
        ax.plot(i, row[tick], 'r^', markersize=4)
        ax.plot(i, mean_value, '_', markerfacecolor='black', markersize=7)
        ax.text(i, 0.9, 'mean: {:.2f}'.format(mean_value),
                horizontalalignment='center', size='x-small')

    ax.set_xticklabels(ticks, rotation='vertical')

    fig.savefig('data/foo_{}.eps'.format(index), bbox_inches='tight', format='eps', dpi=1000)

    # Release the figure so memory does not grow with the number of regions.
    plt.close(fig)