In [1]:
#setup

import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy import stats

In [2]:
#rebuild the full pothole dataset from its two CSV parts

from combine import recombine

# NOTE(review): recombine is a project helper whose implementation is not shown here;
# later cells index its result like a DataFrame (data["CREATION DATE"], ...).
data = recombine(
    "../data/potholes_1.csv",
    "../data/potholes_2.csv",
)

In [3]:
#derive the request year and the completion time (whole days) from the raw date columns

# Parse each raw date column exactly once. The original parsed them and then called
# pd.to_datetime(..., format="%m/%d/%Y") again on the already-parsed datetime columns,
# which changes nothing, so the second pass is removed.
data["creation_date"] = pd.to_datetime(data["CREATION DATE"])
data["completion_date"] = pd.to_datetime(data["COMPLETION DATE"])

# Year in which the request was created.
data["creation_year"] = data["creation_date"].dt.year

# Days between creation and completion, taken from the timedelta with the vectorized
# .dt accessor instead of a per-row lambda; rows with a missing date yield NaN.
data["completion_time"] = (data["completion_date"] - data["creation_date"]).dt.days

In [4]:
#clean data: drop unused columns, drop incomplete rows, store the community area as an int

# One drop call that returns a new frame replaces twenty separate in-place drops.
# The columns removed are exactly the ones the original cell removed.
data = data.drop(
    columns=[
        "CREATION DATE",
        "COMPLETION DATE",
        "STATUS",
        "SERVICE REQUEST NUMBER",
        "TYPE OF SERVICE REQUEST",
        "CURRENT ACTIVITY",
        "MOST RECENT ACTION",
        "STREET ADDRESS",
        "X COORDINATE",
        "Y COORDINATE",
        "SSA",
        "LATITUDE",
        "LONGITUDE",
        "LOCATION",
        "creation_date",
        "completion_date",
        "ZIP",
        "Ward",
        "Police District",
        "NUMBER OF POTHOLES FILLED ON BLOCK",
    ]
)

# Rows with a missing value in any remaining column are discarded; this has to happen
# before the cast below because NaN cannot be converted to int.
data = data.dropna()

# Replace "Community Area" with an integer-typed "community_area" column.
data = data.assign(community_area=data["Community Area"].astype(int)).drop(
    columns="Community Area"
)

In [5]:
#make a list of total pothole count per community area

# Number of requests per community area (one counted creation_year value per request).
potc = data.groupby(["community_area"])["creation_year"].count().reset_index(name="count")

# Remove community area 0 by value. The original used drop(labels=0), which removes
# whatever row sits at position 0 after reset_index and would therefore discard a real
# area if no area-0 rows were present. Filtering keeps the surviving row labels unchanged.
potc = potc[potc["community_area"] != 0]

# Label every count with its community area number so the dependent variable is keyed
# by area, the same way the per-area completion-time table is.
Ya = potc.set_index("community_area")["count"]

# Display only: the result is not assigned, so Ya keeps its labels.
Ya.reset_index(drop=True)

0      9960
1     15420
2      5415
3      7069
4      6335
      ...  
72     7627
73     5097
74     6693
75     2278
76     8928
Name: count, Length: 77, dtype: int64

In [6]:
#average service request completion time for each community area

# Mean completion_time per area, kept as a one-column DataFrame indexed by community_area.
compt = data.groupby("community_area")[["completion_time"]].mean()

# The index holds area numbers here, so this removes community area 0.
compt = compt.drop(index=0)

Yb = compt

In [7]:
#import crime data and compute the average yearly crime count per community area

# Divisor used to turn the total count into a yearly average.
# NOTE(review): 5 is carried over from the original hard-coded value; confirm that
# Crimes_data.csv really spans five years.
CRIME_DATA_YEARS = 5

crimed = pd.read_csv("../data/Crimes_data.csv")
crimed["community_area"] = crimed["Community Area"]
crimed["crime_rate"] = crimed["ID"]

# dropna() returns a new frame; the original discarded that result, so the call had no
# effect. Keeping the result makes the intended filtering of incomplete rows explicit.
crimes = crimed[["community_area", "crime_rate"]].dropna()

# Count the non-null ID values per community area, then average over the years covered.
crime = crimes.groupby(["community_area"]).count()
crime["crime_rate"] = crime["crime_rate"] / CRIME_DATA_YEARS

In [8]:
#per capita yearly income for each community area, read from the census data

# The row labelled 77 is removed before the area number is cast to int.
# NOTE(review): presumably a non-community summary row; verify against the CSV.
incomes = pd.read_csv("../data/chicago_census_data.csv").drop(index=77)

# Add short-named copies of the two columns used below; the trailing space in
# "PER CAPITA INCOME " is part of the column name as written in the original cell.
incomes = incomes.assign(
    avg_income=incomes["PER CAPITA INCOME "],
    community_area=incomes["Community Area Number"].astype(int),
)

income = incomes.loc[:, ["community_area", "avg_income"]]

In [9]:
#create an X matrix for the regression with avg yearly income and crime rate

# Inner join: only community areas present in both the income and the crime table remain.
X = pd.merge(income, crime, on="community_area")

# Label each row with its actual community area number. The original dropped the key and
# shifted the positional index by one (X.index += 1), which only lines up with the
# dependent variables if the merged rows happen to be areas 1..N in order with no gaps.
# The key is cast to int because the join may return it as float.
X = X.astype({"community_area": int}).set_index("community_area")

In [10]:
#put the intercept column in front of the regressors

# prepend=True is the default, spelled out here; the added column is the "const" term
# reported in the regression summaries.
Xc = sm.add_constant(X, prepend=True)

In [11]:
#OLS: total potholes per community area explained by crime rate and yearly income

# Dependent variable: Ya (pothole counts); regressors: Xc (constant, avg_income, crime_rate).
esta = sm.OLS(endog=Ya, exog=Xc)
est1 = esta.fit()

# Last expression of the cell, so the summary table is rendered as the cell output.
est1.summary()

0,1,2,3
Dep. Variable:,count,R-squared:,0.255
Model:,OLS,Adj. R-squared:,0.235
Method:,Least Squares,F-statistic:,12.69
Date:,"Mon, 21 Mar 2022",Prob (F-statistic):,1.83e-05
Time:,18:28:49,Log-Likelihood:,-738.35
No. Observations:,77,AIC:,1483.0
Df Residuals:,74,BIC:,1490.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3355.4005,923.551,3.633,0.001,1515.186,5195.615
avg_income,0.0868,0.027,3.188,0.002,0.033,0.141
crime_rate,2.1707,0.512,4.244,0.000,1.151,3.190

0,1,2,3
Omnibus:,4.793,Durbin-Watson:,1.674
Prob(Omnibus):,0.091,Jarque-Bera (JB):,4.681
Skew:,0.6,Prob(JB):,0.0963
Kurtosis:,2.868,Cond. No.,66900.0


In [12]:
#OLS: average service request completion time per community area explained by crime rate and yearly income

# Dependent variable: Yb (mean completion_time); regressors: Xc (constant, avg_income, crime_rate).
estb = sm.OLS(endog=Yb, exog=Xc)
est2 = estb.fit()

# Last expression of the cell, so the summary table is rendered as the cell output.
est2.summary()

0,1,2,3
Dep. Variable:,completion_time,R-squared:,0.021
Model:,OLS,Adj. R-squared:,-0.006
Method:,Least Squares,F-statistic:,0.787
Date:,"Mon, 21 Mar 2022",Prob (F-statistic):,0.459
Time:,18:28:49,Log-Likelihood:,-287.66
No. Observations:,77,AIC:,581.3
Df Residuals:,74,BIC:,588.4
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,26.4048,2.652,9.957,0.000,21.121,31.689
avg_income,7.767e-05,7.81e-05,0.994,0.324,-7.8e-05,0.000
crime_rate,0.0013,0.001,0.876,0.384,-0.002,0.004

0,1,2,3
Omnibus:,8.612,Durbin-Watson:,0.652
Prob(Omnibus):,0.013,Jarque-Bera (JB):,5.421
Skew:,0.488,Prob(JB):,0.0665
Kurtosis:,2.143,Cond. No.,66900.0
