# 7 Linear Regression Models

4/27/22

This notebook uses the final processed data set, consisting of the California Enviroscans 1 through 4, and the warehouse business census counts for each zip code.

- [Import libraries](#Import-libraries)
- [Import and examine merged CAES and business count data.](#Import-and-examine-merged-CAES-and-business-count-data.)


## Initial setup

### Import libraries

In [5]:
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
import seaborn as sns

## Import and examine merged CAES and business count data.

In [2]:
#import the merged CAESes 1-4 and business-counts-by-zip data 
all_merged_filepath = '../processed_data/all_merged.csv'
#amgd -- all merged dataframe
amgd = pd.read_csv(all_merged_filepath)
amgd.head()

Unnamed: 0,zip,total population,ozone,ozone pctl,pm2.5,pm2.5 pctl,diesel pm,diesel pm pctl,pesticides,pesticides pctl,...,cardiovascular disease,cardiovascular disease pctl,housing burden,housing burden pctl,census_year,est total,est gen,est cold,est farm,est other
0,93706,41087,0.177311,76.55,15.61,95.96,6.14,56.19,1120.3,90.59,...,,,,,2012,4,4,0,0,0
1,93307,82658,0.512336,93.7,19.4,99.69,9.57,64.73,7128.7,98.37,...,,,,,2012,1,0,0,1,0
2,95205,38069,0.02279,38.53,12.51,71.65,18.95,80.5,94.2,72.24,...,,,,,2012,1,0,0,0,1
3,93702,48607,0.317831,86.99,16.08,97.06,36.85,94.91,7.92,53.79,...,,,,,2012,0,0,0,0,0
4,90058,3223,0.015613,31.32,15.26,94.98,56.8,98.36,0.0,0.0,...,,,,,2012,35,12,19,0,4


In [3]:
#look at the data included with the imported file.
amgd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25444 entries, 0 to 25443
Data columns (total 63 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   zip                          25444 non-null  int64  
 1   total population             25444 non-null  int64  
 2   ozone                        25333 non-null  float64
 3   ozone pctl                   25333 non-null  float64
 4   pm2.5                        25286 non-null  float64
 5   pm2.5 pctl                   25286 non-null  float64
 6   diesel pm                    25444 non-null  float64
 7   diesel pm pctl               25444 non-null  float64
 8   pesticides                   25444 non-null  float64
 9   pesticides pctl              25444 non-null  float64
 10  traffic                      25353 non-null  float64
 11  traffic pctl                 25353 non-null  float64
 12  cleanup sites                25444 non-null  float64
 13  cleanup sites pc

In [4]:
# look at basic statistics of the numerical types.
# ces-per-range, county, and city are non-numerical and excluded here
# the 'zip' statistics have little meaning --- they are just region labels.
amgd.describe()

Unnamed: 0,zip,total population,ozone,ozone pctl,pm2.5,pm2.5 pctl,diesel pm,diesel pm pctl,pesticides,pesticides pctl,...,cardiovascular disease,cardiovascular disease pctl,housing burden,housing burden pctl,census_year,est total,est gen,est cold,est farm,est other
count,25444.0,25444.0,25333.0,25333.0,25286.0,25286.0,25444.0,25444.0,25444.0,25444.0,...,16059.0,16059.0,15768.0,15768.0,25444.0,25444.0,25444.0,25444.0,25444.0,25444.0
mean,92864.446156,5917.335796,0.068468,47.320326,10.221574,51.367337,12.497526,50.444645,296.028216,18.939449,...,10.837837,50.011075,18.908923,50.184404,2013.578958,1.959047,1.468716,0.181222,0.02873,0.28038
std,3380.8974,7251.318591,0.112443,30.957227,2.503114,29.032522,16.249286,28.787606,2555.276827,30.078159,...,4.95345,28.891888,8.509076,28.835873,0.590788,5.277823,4.470229,0.757449,0.182993,0.718911
min,32.0,0.0,0.0,0.0,1.651081,0.01,0.0,0.0,0.0,0.0,...,0.0,0.0,0.8,0.012674,2012.0,0.0,0.0,0.0,0.0,0.0
25%,91606.0,3423.75,0.03064,20.65,8.559432,27.690338,0.000898,25.67206,0.0,0.0,...,7.265,25.0,12.6,25.323194,2013.0,0.0,0.0,0.0,0.0,0.0
50%,92703.0,4556.0,0.045884,48.450529,10.37,52.61,7.65,50.667918,0.0,0.0,...,9.76,50.024925,17.6,50.26616,2014.0,0.0,0.0,0.0,0.0,0.0
75%,94564.0,5946.0,0.059387,73.93,12.05,77.977844,19.76,75.4,0.488285,34.11,...,13.32,75.07,23.9,75.107731,2014.0,2.0,1.0,0.0,0.0,0.0
max,96161.0,105549.0,1.285954,100.0,21.2,100.0,253.731,100.0,91316.19,100.0,...,40.85,100.0,78.2,100.0,2014.0,96.0,86.0,19.0,3.0,9.0


### Remove percentile columns.

### Extract numeric columns for linear regressions.

In [None]:
numeric_cols = 

### Inpute missing values.

In [10]:
amgd.fillna(amgd.median())

  amgd.fillna(amgd.median())


Unnamed: 0,zip,total population,ozone,ozone pctl,pm2.5,pm2.5 pctl,diesel pm,diesel pm pctl,pesticides,pesticides pctl,...,cardiovascular disease,cardiovascular disease pctl,housing burden,housing burden pctl,census_year,est total,est gen,est cold,est farm,est other
0,93706,41087,0.177311,76.550000,15.610000,95.960000,6.140000,56.190000,1120.30,90.59,...,9.76,50.024925,17.6,50.26616,2012,4,4,0,0,0
1,93307,82658,0.512336,93.700000,19.400000,99.690000,9.570000,64.730000,7128.70,98.37,...,9.76,50.024925,17.6,50.26616,2012,1,0,0,1,0
2,95205,38069,0.022790,38.530000,12.510000,71.650000,18.950000,80.500000,94.20,72.24,...,9.76,50.024925,17.6,50.26616,2012,1,0,0,0,1
3,93702,48607,0.317831,86.990000,16.080000,97.060000,36.850000,94.910000,7.92,53.79,...,9.76,50.024925,17.6,50.26616,2012,0,0,0,0,0
4,90058,3223,0.015613,31.320000,15.260000,94.980000,56.800000,98.360000,0.00,0.00,...,9.76,50.024925,17.6,50.26616,2012,35,12,19,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25439,92140,3760,0.042599,29.894213,10.270812,50.914748,0.000952,80.883634,0.00,0.00,...,7.98,13.659023,17.6,50.26616,2014,0,0,0,0,0
25440,92135,2467,0.042599,29.894213,10.463289,52.283759,0.000576,63.932794,0.00,0.00,...,3.73,0.024925,17.6,50.26616,2014,0,0,0,0,0
25441,94128,0,0.034190,10.566273,8.789373,32.619788,0.002846,98.382078,0.00,0.00,...,9.76,50.024925,17.6,50.26616,2014,0,0,0,0,0
25442,57,10,0.039421,17.672682,7.052988,6.994400,0.000007,1.008090,0.00,0.00,...,8.59,17.896311,17.6,50.26616,2014,0,0,0,0,0


In [9]:
amgd.median()

  amgd.median()


zip                            9.270300e+04
total population               4.556000e+03
ozone                          4.588368e-02
ozone pctl                     4.845053e+01
pm2.5                          1.037000e+01
pm2.5 pctl                     5.261000e+01
diesel pm                      7.650000e+00
diesel pm pctl                 5.066792e+01
pesticides                     0.000000e+00
pesticides pctl                0.000000e+00
traffic                        8.143707e+02
traffic pctl                   5.060000e+01
cleanup sites                  2.250000e+00
cleanup sites pctl             2.749160e+01
groundwater threats            7.600000e+00
groundwater threats pctl       3.638000e+01
haz. waste                     1.300000e-01
haz. waste pctl                3.919000e+01
imp. water bodies              1.000000e+00
imp. water bodies pctl         1.526000e+01
solid waste                    2.000000e-01
solid waste pctl               3.620000e+00
pollution burden               4

### Assign a label for target columns

These columns are the primary targets later in this notebook.

In [7]:
health_cols = ['asthma', #a
               'low birth weight', #b
              'cardiovascular disease' #c
              ]

## Linear regressions on time and space features.

The analysis later on primarily ignores these features. There are a general general relationships we can draw by looking at them.

In [8]:
time_space_cols = ['year',
                  'latitude',
                  'longitude'
                  ]

In [None]:
X = amgd

In [None]:
lr_time_space_a = LinearRegression()
lr_time_space_a.fit