# Reshape Data for Tableau Dashboard

Aim: split (explode) the column *cities_served*, so that we can sum the number of violations per city and plot them on a map.  

In [62]:
import pandas as pd
import numpy as np
import matplotlib as plt
%matplotlib inline

In [52]:
# read data:
data = pd.read_csv('../data/data_input_for_model.csv')
data.head()

Unnamed: 0,pwsid,pws_name,primacy_agency_code,pws_type_code,gw_sw_code,owner_type_code,population_served_count,primary_source_code,is_wholesaler_ind,is_school_or_daycare_ind,...,counties_served,ansi_entity_code,year,n_viol,n_pesticide_viol,had_violation,had_pesticide_violation,had_violation_lastyear,had_pesticide_violation_lastyear,EPEST_LOW_KG
0,CT0010024,ANDOVER TOWN HALL & FIRE DEPARTMENT,CT,TNCWS,GW,P,25,GW,N,N,...,Tolland,13.0,2012,0.0,0.0,0,0,,,12871.8
1,CT0010024,ANDOVER TOWN HALL & FIRE DEPARTMENT,CT,TNCWS,GW,P,25,GW,N,N,...,Tolland,13.0,2013,0.0,0.0,0,0,0.0,0.0,17060.6
2,CT0010024,ANDOVER TOWN HALL & FIRE DEPARTMENT,CT,TNCWS,GW,P,25,GW,N,N,...,Tolland,13.0,2014,0.0,0.0,0,0,0.0,0.0,11132.3
3,CT0010024,ANDOVER TOWN HALL & FIRE DEPARTMENT,CT,TNCWS,GW,P,25,GW,N,N,...,Tolland,13.0,2015,0.0,0.0,0,0,0.0,0.0,14271.5
4,CT0010024,ANDOVER TOWN HALL & FIRE DEPARTMENT,CT,TNCWS,GW,P,25,GW,N,N,...,Tolland,13.0,2016,0.0,0.0,0,0,0.0,0.0,13550.1


In [5]:
data.shape

(62826, 22)

In [2]:
data.cities_served.value_counts()

WINDHAM                                                                                                                                                                              486
BROOKFIELD                                                                                                                                                                           444
KILLINGTON                                                                                                                                                                           432
CHARLESTOWN                                                                                                                                                                          414
PLYMOUTH                                                                                                                                                                             414
                                                                           

In [53]:
# data.cities_served = data.cities_served.apply(lambda x: [x])
data.cities_served = data.cities_served.apply(lambda x: x.split(','))

In [55]:
data_dashboard = data.explode('cities_served')

In [56]:
print(data_dashboard.shape)
data_dashboard.head()

(64482, 22)


Unnamed: 0,pwsid,pws_name,primacy_agency_code,pws_type_code,gw_sw_code,owner_type_code,population_served_count,primary_source_code,is_wholesaler_ind,is_school_or_daycare_ind,...,counties_served,ansi_entity_code,year,n_viol,n_pesticide_viol,had_violation,had_pesticide_violation,had_violation_lastyear,had_pesticide_violation_lastyear,EPEST_LOW_KG
0,CT0010024,ANDOVER TOWN HALL & FIRE DEPARTMENT,CT,TNCWS,GW,P,25,GW,N,N,...,Tolland,13.0,2012,0.0,0.0,0,0,,,12871.8
1,CT0010024,ANDOVER TOWN HALL & FIRE DEPARTMENT,CT,TNCWS,GW,P,25,GW,N,N,...,Tolland,13.0,2013,0.0,0.0,0,0,0.0,0.0,17060.6
2,CT0010024,ANDOVER TOWN HALL & FIRE DEPARTMENT,CT,TNCWS,GW,P,25,GW,N,N,...,Tolland,13.0,2014,0.0,0.0,0,0,0.0,0.0,11132.3
3,CT0010024,ANDOVER TOWN HALL & FIRE DEPARTMENT,CT,TNCWS,GW,P,25,GW,N,N,...,Tolland,13.0,2015,0.0,0.0,0,0,0.0,0.0,14271.5
4,CT0010024,ANDOVER TOWN HALL & FIRE DEPARTMENT,CT,TNCWS,GW,P,25,GW,N,N,...,Tolland,13.0,2016,0.0,0.0,0,0,0.0,0.0,13550.1


In [57]:
data_dashboard.cities_served.value_counts()

WINDHAM            498
BROOKFIELD         486
KILLINGTON         432
PLYMOUTH           426
CHARLESTOWN        414
                  ... 
WEST WINDSOR         6
LANDAFF              6
STETSONTOWN TWP      6
NATICK               6
PROVINCETOWN         6
Name: cities_served, Length: 1126, dtype: int64

In [54]:
data.cities_served.value_counts()

[WINDHAM]                          486
[BROOKFIELD]                       444
[KILLINGTON]                       432
[PLYMOUTH]                         414
[CHARLESTOWN]                      414
                                  ... 
[T02 R10 WELS]                       6
[CHAIN OF PONDS TWP]                 6
[TIM POND TWP]                       6
[NEW PORTLAND, PIERCE POND TWP]      6
[COBURN GORE]                        6
Name: cities_served, Length: 1205, dtype: int64

In [58]:
# SAVE THE NEWLY CREATED DATA:
data_dashboard.to_csv('../data/data_for_dashboard.csv', index=False)

In [72]:
data_dashboard.columns


Index(['pwsid', 'pws_name', 'primacy_agency_code', 'pws_type_code',
       'gw_sw_code', 'owner_type_code', 'population_served_count',
       'primary_source_code', 'is_wholesaler_ind', 'is_school_or_daycare_ind',
       'service_connections_count', 'cities_served', 'counties_served',
       'ansi_entity_code', 'year', 'n_viol', 'n_pesticide_viol',
       'had_violation', 'had_pesticide_violation', 'had_violation_lastyear',
       'had_pesticide_violation_lastyear', 'EPEST_LOW_KG'],
      dtype='object')

In [84]:
n_viol_by_city = data_dashboard.groupby(['cities_served'])['n_viol'].sum()
n_viol_by_city

cities_served
ABBOT           0.0
ABINGTON        9.0
ACTON          40.0
ACUSHNET        5.0
ACWORTH         0.0
               ... 
WORCESTER       6.0
WORTHINGTON    17.0
WRENTHAM        0.0
YARMOUTH        0.0
YORK            8.0
Name: n_viol, Length: 1126, dtype: float64

In [92]:
# NUMBER OF CITIES SERVEED BY A WS WHERE ONE VIOLATION WAS OBSERVED:
# (cities are usually served by numerous water systems)
pd.Series(np.where(n_viol_by_city>1,1,0)).value_counts()

1    735
0    391
dtype: int64

In [93]:
data.year.value_counts()

2015    10471
2014    10471
2013    10471
2012    10471
2017    10471
2016    10471
Name: year, dtype: int64