In [1]:
%load_ext lab_black
import os
from combine import recombine
import pandas as pd
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy import stats
import matplotlib.pyplot as plt

In [2]:
# Load the pothole service-request data by handing both CSV paths to the
# project-local `recombine` helper (imported from `combine`).
# NOTE(review): presumably returns one pandas DataFrame covering both files --
# later cells call DataFrame methods on `data`; confirm against combine.py.

data = recombine("../data/potholes_1.csv", "../data/potholes_2.csv")

In [None]:
# Derive, for every service request, the year it was created and the number
# of whole days between its creation and its completion.

# Keep a single row per service request number.
data = data.drop_duplicates(subset=["SERVICE REQUEST NUMBER"])

# Parse the raw date strings once. The resulting columns are datetime64, so
# they can be used directly through the vectorised `.dt` accessor.
data["creation_date"] = pd.to_datetime(data["CREATION DATE"])
data["completion_date"] = pd.to_datetime(data["COMPLETION DATE"])
data["creation_year"] = data["creation_date"].dt.year

# Elapsed whole days from creation to completion; rows with a missing date
# yield a missing value here and are removed by the later `dropna()`.
data["completion_time"] = (data["completion_date"] - data["creation_date"]).dt.days

In [None]:
# Reduce `data` to the columns the analysis needs, discard incomplete rows,
# and store the community-area code as an integer column `community_area`.

# Columns that play no part in the regressions (raw dates are superseded by
# `creation_year` / `completion_time`, which were derived from them).
unused_columns = [
    "CREATION DATE",
    "COMPLETION DATE",
    "STATUS",
    "SERVICE REQUEST NUMBER",
    "TYPE OF SERVICE REQUEST",
    "CURRENT ACTIVITY",
    "MOST RECENT ACTION",
    "STREET ADDRESS",
    "X COORDINATE",
    "Y COORDINATE",
    "SSA",
    "LATITUDE",
    "LONGITUDE",
    "LOCATION",
    "creation_date",
    "completion_date",
    "ZIP",
    "Ward",
    "Police District",
    "NUMBER OF POTHOLES FILLED ON BLOCK",
]

# One drop call instead of twenty in-place drops; missing values must be
# removed *before* the integer cast below, which cannot represent NaN.
data = data.drop(columns=unused_columns).dropna()
data["community_area"] = data["Community Area"].astype(int)
data = data.drop(columns="Community Area")

In [None]:
# Count the pothole service requests recorded for each community area.

potc = (
    data.groupby("community_area")["creation_year"].count().reset_index(name="count")
)

# Remove area code 0 by *value* rather than by row position, so the right row
# is dropped even if code 0 is absent or not the first group.
# NOTE(review): code 0 is presumably a placeholder for "unknown area" -- the
# completion-time cell drops the same label; confirm against the data source.
potc = potc[potc["community_area"] != 0]

# Label each count with its community-area number. The pothole-density cell
# calls `Ytemp.to_frame().reset_index()` and reads the resulting "index"
# column as the area number, so the axis is left unnamed on purpose.
Ytemp = potc.set_index("community_area")["count"].rename_axis(None)
Ytemp

In [None]:
# Mean service-request completion time for every community area, kept as a
# one-column DataFrame indexed by `community_area`.

compt = data.groupby("community_area")[["completion_time"]].mean()
# Discard the row whose community-area label is 0.
compt = compt.drop(index=0)
# Dependent variable of the second regression (same object as `compt`).
Yb = compt

In [None]:
# Load the surface area of every community area (in km^2, per the original
# notebook comment) and expose it under the column names the other tables use.

areas = pd.read_csv("../data/surfaceareas.csv")
# Copy the two source columns to their new names, then remove the originals.
areas = areas.assign(
    community_area=areas["Area Number"],
    area=areas["Area"],
).drop(columns=["Area Number", "Area"])

In [None]:
# Load the crime records and compute the average number of recorded crimes
# per year for each community area.

# Divisor that turns a total count into a yearly average.
# NOTE(review): presumably the number of years covered by Crimes_data.csv --
# confirm against the data extract.
CRIME_DATA_YEARS = 5

crimed = pd.read_csv("../data/Crimes_data.csv")
crimed["community_area"] = crimed["Community Area"]
crimed["crime_rate"] = crimed["ID"]

# Keep only complete (area, record id) pairs. The result must be assigned:
# `dropna()` returns a new frame and leaves the original untouched.
crimes = crimed[["community_area", "crime_rate"]].dropna()

# Number of records per area, indexed by `community_area`, scaled to a
# per-year figure.
crime = crimes.groupby("community_area").count()
crime["crime_rate"] = crime["crime_rate"] / CRIME_DATA_YEARS

In [None]:
# Load the census table and keep, per community area, the per-capita income
# and the percentage of households below the poverty line.

# Row label 77 is removed right after loading.
# NOTE(review): presumably a non-area summary row whose area number cannot be
# cast to int -- confirm against the CSV.
incomes = pd.read_csv("../data/chicago_census_data.csv").drop(index=77)

# Short, merge-friendly aliases for the source columns (the trailing space in
# "PER CAPITA INCOME " is part of the header in the file).
incomes = incomes.assign(
    avg_income=incomes["PER CAPITA INCOME "],
    below_poverty=incomes["PERCENT HOUSEHOLDS BELOW POVERTY"],
    community_area=incomes["Community Area Number"].astype(int),
)
income = incomes[["community_area", "avg_income", "below_poverty"]]

In [None]:
# Load the population table and keep one population figure per community area.

pops = pd.read_csv("../data/population.csv")
# "GeogKey" is used as the community-area identifier for the later merges.
pops = pops.assign(
    population=pops["Total Population"],
    community_area=pops["GeogKey"],
)
pop = pops[["community_area", "population"]]

In [None]:
# Population density of every community area: population divided by surface
# area (people/km^2, per the original notebook comment).

temp = pop.merge(areas, on="community_area")
# Add the ratio, then discard the two inputs it was computed from.
temp = temp.assign(popdensity=temp["population"] / temp["area"]).drop(
    columns=["area", "population"]
)
popdens = temp

In [None]:
# Pothole density of every community area: request count divided by surface
# area (potholes/km^2, per the original notebook comment).

# Turn the count series into a frame; its index labels become the
# `community_area` key used for the merge.
t = Ytemp.to_frame().reset_index()
t = t.assign(community_area=t["index"]).drop(columns="index")

temp2 = t.merge(areas, on="community_area")
temp2 = temp2.assign(density=temp2["count"] / temp2["area"]).drop(
    columns=["area", "count"]
)

# Dependent variable of the first regression. The merge result is numbered
# from 0, so shift the labels to start at 1; the design matrix `X` is shifted
# the same way and the regression needs both indexes to be identical.
Ya = temp2["density"]
Ya.index = Ya.index + 1

In [None]:
# Build the regressor matrix X: per-capita income, share of households below
# the poverty line, yearly crime count and population density, one row per
# community area.

Xt = income.merge(crime, on="community_area")
# The area key is only needed for joining; it is not a regressor.
X = Xt.merge(popdens, on="community_area").drop(columns="community_area")
# Number the rows from 1 so the index matches the dependent variables.
X.index = X.index + 1

In [None]:
# Add the constant (intercept) column to the regressor matrix via
# statsmodels' `add_constant`; `Xc` is the design matrix passed to both OLS
# models below.

Xc = sm.add_constant(X)

In [None]:
# First model: ordinary least squares of pothole density per community area
# (Ya) on the design matrix Xc -- constant, per-capita income, share of
# households below the poverty line, crime rate and population density.

esta = sm.OLS(endog=Ya, exog=Xc)
est1 = esta.fit()
# Last expression of the cell: renders the regression table.
est1.summary()

In [None]:
# Second model: ordinary least squares of the mean service-request completion
# time per community area (Yb) on the same design matrix Xc -- constant,
# per-capita income, share of households below the poverty line, crime rate
# and population density.

estb = sm.OLS(endog=Yb, exog=Xc)
est2 = estb.fit()
# Last expression of the cell: renders the regression table.
est2.summary()

In [None]:
# Export the first regression summary as an image (../artifacts/reg1.png).

# Figure defaults for this and any later figure: 4x3 inches at 900 dpi.
plt.rc("figure", figsize=(4, 3), dpi=900)

# Draw on an explicitly created figure instead of pyplot's implicit "current"
# figure, so nothing left over from an earlier cell can end up in the image.
fig, ax = plt.subplots()
# The summary is a fixed-width text table, hence the monospace font.
ax.text(0.01, 0.05, str(est1.summary()), {"fontsize": 9}, fontproperties="monospace")
ax.axis("off")

os.makedirs("../artifacts", exist_ok=True)
# bbox_inches="tight" grows the saved area to contain the whole text block;
# without it, text extending past the 4x3 inch canvas is cut off in the file.
fig.savefig("../artifacts/reg1.png", bbox_inches="tight")

In [None]:
# Export the second regression summary as an image (../artifacts/reg2.png).

# Figure defaults for this and any later figure: 4x3 inches at 900 dpi.
plt.rc("figure", figsize=(4, 3), dpi=900)

# A fresh figure is essential here: drawing on pyplot's implicit "current"
# figure could overlay this summary on a figure that is still open.
fig, ax = plt.subplots()
# The summary is a fixed-width text table, hence the monospace font.
ax.text(0.01, 0.05, str(est2.summary()), {"fontsize": 9}, fontproperties="monospace")
ax.axis("off")

# Make sure the output directory exists even when this cell is run on its own.
os.makedirs("../artifacts", exist_ok=True)
# bbox_inches="tight" grows the saved area to contain the whole text block;
# without it, text extending past the 4x3 inch canvas is cut off in the file.
fig.savefig("../artifacts/reg2.png", bbox_inches="tight")