In [1]:
#setup

import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy import stats

In [2]:
# Build the pothole service-request table by passing the two pothole CSV
# files to the project-local `recombine` helper.

from combine import recombine

POTHOLE_PARTS = ("../data/potholes_1.csv", "../data/potholes_2.csv")
data = recombine(*POTHOLE_PARTS)

In [3]:
# Parse the request dates, record the year each request was created, and
# compute how many whole days each request took to complete.

data["creation_date"] = pd.to_datetime(data["CREATION DATE"])
data["completion_date"] = pd.to_datetime(data["COMPLETION DATE"])

# The columns above are already datetime64, so they are not re-parsed with an
# explicit format (that second pass had no effect on parsed values).
data["creation_year"] = data["creation_date"].dt.year

# Timedelta -> whole days through the vectorised .dt accessor instead of a
# per-row lambda. Rows with a missing date produce a missing value here.
data["completion_time"] = (data["completion_date"] - data["creation_date"]).dt.days

In [4]:
# Keep only the columns the analysis uses, discard rows with any missing
# value, and store the community area as an integer key.

# Columns that play no part in the regressions below. The two parsed date
# columns are included because only creation_year / completion_time are kept.
UNUSED_COLUMNS = [
    "CREATION DATE",
    "COMPLETION DATE",
    "STATUS",
    "SERVICE REQUEST NUMBER",
    "TYPE OF SERVICE REQUEST",
    "CURRENT ACTIVITY",
    "MOST RECENT ACTION",
    "STREET ADDRESS",
    "X COORDINATE",
    "Y COORDINATE",
    "SSA",
    "LATITUDE",
    "LONGITUDE",
    "LOCATION",
    "creation_date",
    "completion_date",
    "ZIP",
    "Ward",
    "Police District",
    "NUMBER OF POTHOLES FILLED ON BLOCK",
]

# Order matters: the unused columns are dropped BEFORE dropna() so that gaps
# in columns we do not use cannot remove otherwise complete rows.
data = (
    data.drop(columns=UNUSED_COLUMNS)
    .dropna()
    # The integer cast is only safe once rows with a missing area are gone.
    .assign(community_area=lambda frame: frame["Community Area"].astype(int))
    .drop(columns="Community Area")
)

In [5]:
# Count pothole service requests per community area.

potc = data.groupby(["community_area"])["creation_year"].count().reset_index(name="count")

# Remove community area 0 by key. After reset_index the row labels are plain
# positions, so dropping label 0 would remove whichever area happens to come
# first; filtering on the key removes area 0 and nothing else.
# NOTE(review): area 0 is presumably a placeholder for requests without a
# valid community area; confirm against the data dictionary.
potc = potc[potc["community_area"] != 0]
Ya = potc["count"]

# Display only: the re-indexed copy is shown but not assigned, so Ya keeps
# the row labels it inherited from potc.
Ya.reset_index(drop=True)

0      9960
1     15420
2      5415
3      7069
4      6335
      ...  
72     7627
73     5097
74     6693
75     2278
76     8928
Name: count, Length: 77, dtype: int64

In [6]:
# Mean service-request completion time (in days) for each community area,
# kept as a one-column frame indexed by community_area.

area_means = data.groupby("community_area")[["completion_time"]].mean()

# Community area 0 is excluded from the regression response.
compt = area_means.drop(index=0)
Yb = compt

In [7]:
# Load the crime records and compute the average number of recorded crimes
# per year for each community area.

# Divisor that turns the total count into a yearly figure.
# NOTE(review): assumes Crimes_data.csv covers exactly five years; confirm.
CRIME_DATA_YEARS = 5

crimed = pd.read_csv("../data/Crimes_data.csv")
crimed["community_area"] = crimed["Community Area"]
crimed["crime_rate"] = crimed["ID"]

# dropna() returns a new frame rather than modifying in place, so the result
# must be kept for incomplete records to actually be removed.
crimes = crimed[["community_area", "crime_rate"]].dropna()

# One row per community area (the group key becomes the index); the count of
# record IDs is then scaled to a per-year average.
crime = crimes.groupby(["community_area"]).count()
crime["crime_rate"] = crime["crime_rate"] / CRIME_DATA_YEARS

In [8]:
# Load the census table and keep per-capita income and the share of
# households below poverty for each community area.

# NOTE(review): row label 77 is removed before the integer cast below;
# presumably it is a citywide summary row with no area number. Confirm.
incomes = pd.read_csv("../data/chicago_census_data.csv").drop(index=77)

# The trailing space in "PER CAPITA INCOME " is part of the column name.
incomes = incomes.assign(
    avg_income=incomes["PER CAPITA INCOME "],
    below_poverty=incomes["PERCENT HOUSEHOLDS BELOW POVERTY"],
    community_area=incomes["Community Area Number"].astype(int),
)
income = incomes.loc[:, ["community_area", "avg_income", "below_poverty"]]

In [9]:
# Load total population for each community area.

pops = pd.read_csv("../data/population.csv")

# Give the two columns we need the short names used by the later merges.
pops = pops.assign(
    population=pops["Total Population"],
    community_area=pops["GeogKey"],
)
pop = pops.loc[:, ["community_area", "population"]]

In [10]:
# Create the X matrix for the regressions: per-capita income, share of
# households below poverty, yearly crime count, and population, with one row
# per community area.

Xt = pd.merge(income, crime, on="community_area")

# Label each row with its actual community area and sort by it. Ya and Yb are
# built from groupby("community_area"), so their rows are in ascending area
# order; keying X the same way matches every response to its own area's
# regressors instead of relying on CSV row order plus a shifted row counter.
# NOTE(review): the resulting labels are expected to be 1..77, the same
# labels the previous `X.index += 1` produced; confirm on the real data.
X = (
    pd.merge(Xt, pop, on="community_area")
    .astype({"community_area": int})
    .set_index("community_area")
    .sort_index()
)

In [11]:
# Add the intercept column to the regressor matrix so the OLS fits below
# include a constant term.

Xc = sm.add_constant(X, prepend=True)

In [12]:
# OLS of total pothole requests per community area on per-capita income,
# share of households below poverty, yearly crime count, and population.

esta = sm.OLS(endog=Ya, exog=Xc)
est1 = esta.fit()
est1.summary()

0,1,2,3
Dep. Variable:,count,R-squared:,0.821
Model:,OLS,Adj. R-squared:,0.811
Method:,Least Squares,F-statistic:,82.48
Date:,"Mon, 21 Mar 2022",Prob (F-statistic):,3.98e-26
Time:,19:20:50,Log-Likelihood:,-683.49
No. Observations:,77,AIC:,1377.0
Df Residuals:,72,BIC:,1389.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5109.2644,895.312,5.707,0.000,3324.493,6894.036
avg_income,-0.0298,0.017,-1.781,0.079,-0.063,0.004
below_poverty,-126.2861,25.215,-5.008,0.000,-176.552,-76.020
crime_rate,0.9123,0.344,2.649,0.010,0.226,1.599
population,0.1398,0.012,12.034,0.000,0.117,0.163

0,1,2,3
Omnibus:,2.491,Durbin-Watson:,2.171
Prob(Omnibus):,0.288,Jarque-Bera (JB):,1.792
Skew:,0.334,Prob(JB):,0.408
Kurtosis:,3.335,Cond. No.,214000.0


In [13]:
# OLS of mean pothole request completion time per community area on
# per-capita income, share of households below poverty, yearly crime count,
# and population.

estb = sm.OLS(endog=Yb, exog=Xc)
est2 = estb.fit()
est2.summary()

0,1,2,3
Dep. Variable:,completion_time,R-squared:,0.132
Model:,OLS,Adj. R-squared:,0.084
Method:,Least Squares,F-statistic:,2.74
Date:,"Mon, 21 Mar 2022",Prob (F-statistic):,0.0351
Time:,19:20:50,Log-Likelihood:,-283.02
No. Observations:,77,AIC:,576.0
Df Residuals:,72,BIC:,587.8
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,28.4508,4.935,5.766,0.000,18.614,38.288
avg_income,-5.257e-05,9.22e-05,-0.570,0.570,-0.000,0.000
below_poverty,-0.1433,0.139,-1.031,0.306,-0.420,0.134
crime_rate,-8.328e-05,0.002,-0.044,0.965,-0.004,0.004
population,0.0002,6.4e-05,2.411,0.018,2.67e-05,0.000

0,1,2,3
Omnibus:,6.819,Durbin-Watson:,0.95
Prob(Omnibus):,0.033,Jarque-Bera (JB):,4.577
Skew:,0.445,Prob(JB):,0.101
Kurtosis:,2.203,Cond. No.,214000.0
