In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import re
import string

from dbfread import DBF

import operator
import pickle

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set_style('white')

In [2]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from mlxtend.preprocessing import standardize

In [3]:
#pickle objects
def make_pickle(obj, filename):
    with open(filename, 'wb') as picklefile:
        pickle.dump(obj, picklefile)

#open pickled objects
def open_pickle(filename):
    with open(filename, 'rb') as picklefile:
        return pickle.load(picklefile)

### Load and merge schools and PWS data

In [4]:
#load school df, remove punct, make upper case, keep relevant columns
schools = open_pickle('../data/interim/schools.pkl')
schools['NAME_y'].fillna(schools['NAME_x'], inplace=True)
schools['NAME_y'] = schools['NAME_y'].apply(lambda x: x.translate(None, string.punctuation).upper())
schools = schools[['NAME_y', 'ORGCODE', 'RESULT', 'TOWN_x', 'YEAR_BUILT', 'TOTAL_VAL', 'TYPE', 'EJ']]

In [5]:
len(schools)

875

In [6]:
#drop schools without building data
schools.dropna(inplace=True)

In [7]:
schools.isnull().sum()

NAME_y        0
ORGCODE       0
RESULT        0
TOWN_x        0
YEAR_BUILT    0
TOTAL_VAL     0
TYPE          0
EJ            0
dtype: int64

In [8]:
len(schools)

791

In [9]:
#load pws df, remove punct, make upper case
pws = open_pickle('../data/interim/sch_pws.pkl')
pws['PWS_NAME'] = pws['PWS_NAME'].apply(lambda x: x.translate(None, string.punctuation).upper())

In [10]:
school_pws = pws[pws['PWS_NAME'].str.contains('SCHOOL')]
town_pws = pws[~pws['PWS_NAME'].str.contains('SCHOOL')]

In [11]:
#create substring for matching school names to pws name
schools['MATCH_NAME'] = schools['NAME_y'].apply(lambda x: re.findall('(\w+\s+\w+)', x))
schools.loc[schools['NAME_y'] == 'GOALS', 'MATCH_NAME'] = ['GOALS']
schools.loc[schools['NAME_y'] == 'MOPPETS', 'MATCH_NAME'] = ['MOPPETS']

In [12]:
schools['MATCH_NAME'] = schools['MATCH_NAME'].apply(lambda x: x[0])

In [13]:
school_pws['MATCH_NAME'] = school_pws['PWS_NAME'].apply(lambda x: re.findall('(\w+\s+\w+)', x)[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [14]:
match_name_merge = schools.merge(school_pws, how='left', on='MATCH_NAME', indicator=True)

In [15]:
match_name_merge['_merge'].value_counts()

left_only     774
both           17
right_only      0
Name: _merge, dtype: int64

In [16]:
match_name = match_name_merge[match_name_merge['_merge'] == 'both']
schools = match_name_merge[match_name_merge['_merge'] == 'left_only']

In [17]:
drop_cols = list(school_pws.columns)
drop_cols.append('_merge')
schools.drop(labels=drop_cols, axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [18]:
len(schools)

774

In [19]:
schools.columns

Index([    u'NAME_y',    u'ORGCODE',     u'RESULT',     u'TOWN_x',
       u'YEAR_BUILT',  u'TOTAL_VAL',       u'TYPE',         u'EJ'],
      dtype='object')

In [20]:
town_name = schools.merge(town_pws, how = 'left', left_on='TOWN_x', right_on='TOWN', indicator=True)

In [21]:
town_name['_merge'].value_counts()

both          747
left_only     119
right_only      0
Name: _merge, dtype: int64

In [22]:
town_name.dropna(inplace=True)
town_name.drop_duplicates(inplace=True)

In [23]:
#town_name[town_name[['NAME_y', 'TOWN']].duplicated(keep=False)].sort_values('NAME_y')[['NAME_y', 'TOWN_x']]
#town_name.drop_duplicates(['NAME_y', 'TOWN'], inplace=True)

In [24]:
sch_pws = pd.concat([town_name, match_name])

In [25]:
len(sch_pws)

691

In [26]:
sch_pws.drop_duplicates(subset=['NAME_y', 'TOWN_x'], inplace=True)

In [27]:
schools = sch_pws[['ORGCODE', 'NAME_y', 'TOWN', 'EJ', 'YEAR_BUILT', 'TOTAL_VAL', 'TYPE', 'RESULT', 'PWS_ID', 'PWS_NAME', 'PWS_CLASS', 'COUNT_CONN', 'TOTAL_INTAKES', 'GW_RATIO', 'SURVEY', 'PB90_RESULT', 'PB_90', 'MWRA']]

In [28]:
schools.rename(columns={'ORGCODE': 'SCH_ID',
               'NAME_y': 'SCH_NAME'},
              inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)


### Create features

** Potential features related to parcels: **
* TOTAL_VAL: Current total assessed value for land and structures
* _LS__DATE: Last sale date formatted as YYYYMMDD - DROP FOR NOW_
* YEAR_BUILT: format YYYY

** Potential features related to schools: **
* TYPE: Identifies school type:
    * PUB - Public
    * PRI - Private
    * CHA - Charter
    * SPE - Special Education (Approved)
    * SPU - Special Education (Unapproved)
* _GRADES: Grade levels offered at the school - DROP FOR NOW_
* EJ

** Potential features related to PWS: **
* COUNT_CONN:
* TOTAL_INTAKES:
* GW_RATIO:
* SW_RATIO:
* SURVEY:
* PB_90:

** Target **
* RESULTS: max concentration (mg/L) of lead in samples collected from given school

In [29]:
schools.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 672 entries, 0 to 712
Data columns (total 18 columns):
SCH_ID           672 non-null object
SCH_NAME         672 non-null object
TOWN             672 non-null object
EJ               672 non-null float64
YEAR_BUILT       672 non-null object
TOTAL_VAL        672 non-null object
TYPE             672 non-null object
RESULT           672 non-null float64
PWS_ID           672 non-null object
PWS_NAME         672 non-null object
PWS_CLASS        672 non-null object
COUNT_CONN       672 non-null object
TOTAL_INTAKES    672 non-null float64
GW_RATIO         672 non-null float64
SURVEY           672 non-null float64
PB90_RESULT      672 non-null object
PB_90            672 non-null float64
MWRA             672 non-null float64
dtypes: float64(7), object(11)
memory usage: 99.8+ KB


In [30]:
for column in ['YEAR_BUILT', 'RESULT', 'TOTAL_VAL', 'COUNT_CONN', 'PB90_RESULT']:
    schools[column] = pd.to_numeric(schools[column])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [31]:
#drop schools without year built data
schools = schools[schools['YEAR_BUILT'] != 0]
schools.reset_index(drop=True, inplace=True)

In [32]:
#fill empty val strings with NaN
schools['TOTAL_VAL'] = schools['TOTAL_VAL'].apply(lambda x: np.nan if not x else x)

### Clean features

In [33]:
dummies = ['SURVEY', 'PB_90', 'MWRA', 'EJ']
categoricals = ['TYPE', 'PWS_CLASS']
numericals = ['TOTAL_VAL', 'YEAR_BUILT', 'COUNT_CONN', 'TOTAL_INTAKES', 'GW_RATIO', 'PB90_RESULT']
target = ['RESULT']
features = dummies + categoricals + numericals + target
X = schools[features]

In [34]:
#fill missing total parcel value with median
#X.loc[:,'TOTAL_VAL'] = X.loc[:,'TOTAL_VAL'].fillna(X['TOTAL_VAL'].median())
X['TOTAL_VAL'].fillna(X['TOTAL_VAL'].median(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [35]:
#convert YEAR_BUILT to year since 1800 (earliest YEAR_BUILT)
#X.loc[:,'YEAR_SINCE_1800'] = X['YEAR_BUILT'].apply(lambda x: x - 1800)
X['YEAR_SINCE_1800'] = X['YEAR_BUILT'].apply(lambda x: x - 1800)
X = X.drop('YEAR_BUILT', axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [36]:
#create dummy variables for categoricals
#all_dummies = pd.DataFrame(schools['RESULT'])
#for dummy in dummies:
#    all_dummies = all_dummies.merge(pd.DataFrame(X.pop(dummy)), left_index=True, right_index=True)

for category in categoricals:
    cat_dummies = pd.get_dummies(X[category], drop_first=True)
    X = X.merge(cat_dummies, left_index=True, right_index=True)
    X = X.drop(category, axis=1)

In [37]:
dummy_cols = {'PRI': 'SCH_PRI',
              'PUB': 'SCH_PUB', 
              'SPE': 'SCH_SPE', 
              'SPU': 'SCH_SPU', 
              'NC': 'PWS_NC', 
              'NTNC': 'PWS_NTNC'}
X.rename(columns=dummy_cols, inplace=True)

In [38]:
pb_al = 0.015

In [39]:
#for whether a result is greater than action level
X.loc[X['RESULT'] > pb_al, 'EXCEED'] = 1
X['EXCEED'].fillna(0, inplace=True)
X.drop('RESULT', axis=1, inplace=True)

In [40]:
X.head()

Unnamed: 0,SURVEY,PB_90,MWRA,EJ,TOTAL_VAL,COUNT_CONN,TOTAL_INTAKES,GW_RATIO,PB90_RESULT,YEAR_SINCE_1800,SCH_PRI,SCH_PUB,SCH_SPE,SCH_SPU,PWS_NC,PWS_NTNC,EXCEED
0,1.0,1.0,0.0,1.0,5357900.0,43034,11.0,0.0,0.003,140,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,1.0,0.0,0.0,15958200.0,11272,5.0,0.6,0.003,162,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,0.0,0.0,8307600.0,11272,5.0,0.6,0.003,151,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,1.0,1.0,0.0,0.0,2403400.0,11272,5.0,0.6,0.003,140,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,1.0,1.0,0.0,0.0,5306600.0,11272,5.0,0.6,0.003,136,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [41]:
#use all PWS 
X.drop(['PB90_RESULT'], axis=1, inplace=True)

#use only PWS with pb90 results
#X = X[X['PB90_RESULT'] >= 0]
#X.drop(['PB_90'], axis=1, inplace=True)

In [42]:
y = X.pop('EXCEED')

In [43]:
len(X)

580

In [44]:
#train/test split, then cv on only training data, score on testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=120, random_state=4444, stratify=y)
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=80, random_state=4444)

In [45]:
for df in [X_train, X_test, y_train, y_test]:
    df.reset_index(inplace=True, drop=True)

In [46]:
X_train.head()

Unnamed: 0,SURVEY,PB_90,MWRA,EJ,TOTAL_VAL,COUNT_CONN,TOTAL_INTAKES,GW_RATIO,YEAR_SINCE_1800,SCH_PRI,SCH_PUB,SCH_SPE,SCH_SPU,PWS_NC,PWS_NTNC
0,1.0,1.0,1.0,0.0,14214000.0,9242,4.0,0.0,162,0.0,1.0,0.0,0.0,0.0,0.0
1,1.0,1.0,0.0,1.0,17544500.0,42650,4.0,0.0,131,0.0,1.0,0.0,0.0,0.0,0.0
2,1.0,1.0,0.0,1.0,9679600.0,19809,2.0,0.0,201,0.0,1.0,0.0,0.0,0.0,0.0
3,1.0,1.0,0.0,1.0,1274400.0,42650,4.0,0.0,163,0.0,1.0,0.0,0.0,0.0,0.0
4,1.0,1.0,0.0,1.0,47661500.0,43034,11.0,0.0,190,0.0,1.0,0.0,0.0,0.0,0.0


In [47]:
s_cols = ['TOTAL_VAL', 'YEAR_SINCE_1800', 'COUNT_CONN', 'TOTAL_INTAKES']
X_train_s = X_train[s_cols]
X_test_s = X_test[s_cols]

In [48]:
#standardize numericals (train and test separately)
scaler = preprocessing.StandardScaler()
X_train_s = pd.DataFrame(scaler.fit_transform(X_train_s))
X_train_s.columns = s_cols

X_test_s = pd.DataFrame(scaler.transform(X_test_s))
X_test_s.columns = s_cols

In [49]:
X_test_s.head()

Unnamed: 0,TOTAL_VAL,YEAR_SINCE_1800,COUNT_CONN,TOTAL_INTAKES
0,0.005019,-1.851993,-0.873431,-0.330923
1,-0.812502,-0.294093,2.234522,-0.330923
2,-0.003957,1.073043,-0.171449,-0.330923
3,0.135122,0.596135,2.234522,-0.330923
4,1.115713,0.723311,-0.938915,-0.061499


In [50]:
#standardize numericals (train and test separately)
X_train.drop(labels=s_cols, axis=1, inplace=True)
X_test.drop(labels=s_cols, axis=1, inplace=True)

X_train = X_train.merge(X_train_s, how='left', left_index=True, right_index=True)
X_test = X_test.merge(X_test_s, how='left', left_index=True, right_index=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [51]:
make_pickle((X_train, X_test, y_train, y_test), '../data/processed/train_test.pkl')