In [3]:
%matplotlib
import gmaps
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import os
import pandas as pd
pd.options.display.max_columns = None
import requests
import time
from scipy.stats import linregress, pearsonr
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import statsmodels.formula.api as smf
#from sklearn.decomposition import FactorAnalysis

Using matplotlib backend: Qt5Agg


In [4]:
#Accessing full neighborhood dataset, which was derived from two data sets (Neighborhood_Health.csv and Reported_Crimes.csv) and neighborhood_crime.ipynb

z = pd.read_csv('output_data/neighborhood_full_data.csv')

# Index(['Community Area Name', 'Community Area', 'Birth Rate',
#        'General Fertility Rate', 'Low Birth Weight',
#        'Prenatal Care Beginning in First Trimester', 'Preterm Births',
#        'Teen Birth Rate', 'Homicide_rate_per_100k', 'Breast cancer in females',
#        'Cancer (All Sites)', 'Colorectal Cancer', 'Diabetes-related',
#        'Firearm-related', 'Infant Mortality Rate', 'Lung Cancer',
#        'Prostate Cancer in Males', 'Stroke (Cerebrovascular Disease)',
#        'Childhood Blood Lead Level Screening', 'Childhood Lead Poisoning',
#        'Gonorrhea in Females', 'Tuberculosis', 'Below Poverty Level',
#        'Crowded Housing', 'Dependency', 'No High School Diploma',
#        'Per Capita Income', 'Unemployment', 'ASSAULT', 'DECEPTIVE PRACTICE',
#        'OFFENSE INVOLVING CHILDREN', 'CRIM SEXUAL ASSAULT', 'CRIMINAL DAMAGE',
#        'CRIMINAL TRESPASS', 'WEAPONS VIOLATION', 'MOTOR VEHICLE THEFT',
#        'NARCOTICS', 'THEFT', 'BATTERY', 'OTHER OFFENSE', 'ROBBERY',
#        'PUBLIC PEACE VIOLATION', 'SEX OFFENSE', 'PROSTITUTION', 'BURGLARY',
#        'STALKING', 'CRIMINAL SEXUAL ASSAULT', 'ARSON', 'Homicide', 'GAMBLING',
#        'KIDNAPPING', 'INTERFERENCE WITH PUBLIC OFFICER', 'INTIMIDATION',
#        'LIQUOR LAW VIOLATION', 'OBSCENITY', 'PUBLIC INDECENCY',
#        'OTHER NARCOTIC VIOLATION', 'NON-CRIMINAL', 'HUMAN TRAFFICKING',
#        'RITUALISM', 'Violence', 'Sexual Crimes', 'Property Crimes', 'Drugs'],
#       dtype='object')

# 77 rows × 64 columns

In [5]:
# Variable Units

# Per Capita Income, Birth Rate per (1k),
# General Fertility Rate per (1k),% Low Birth Weight,% Prenatal Care in First Trimester,
# % Preterm Births,Teen Birth Rate per (1k),Homicides per (100k),Breast Cancer Females per (100k),
# Cancer (All Sites) per (100k),Colorectal Cancer per (100k),Diabetes-related per (100k),Firearm-related per (100k),
# Infant Mortality Rate per (1k),Lung Cancer per (100k),Prostate Cancer in Males per (100k),
# Stroke (Cerebrovascular Disease) per (100k),Childhood Blood Lead Level Screening per (1k),
# Childhood Lead Poisoning per (100),Gonorrhea in Females per (100k),Gonorrhea in Males per (100k),
# Tuberculosis per (100k),% Below Poverty Level,% Crowded Housing,% Dependency,% No High School Diploma,Unemployment

In [6]:
# Removing any crime that is not homicide, violence, or property crimes from dataframe for consistency with state-level analysis

z = z[['Community Area Name', 'Homicide', 'Homicide_rate_per_100k', 'Violence', 'Property Crimes', 'Birth Rate',
       'General Fertility Rate', 'Low Birth Weight',
       'Prenatal Care Beginning in First Trimester', 'Preterm Births',
       'Teen Birth Rate', 'Breast cancer in females',
       'Cancer (All Sites)', 'Colorectal Cancer', 'Diabetes-related',
       'Firearm-related', 'Infant Mortality Rate', 'Lung Cancer',
       'Prostate Cancer in Males', 'Stroke (Cerebrovascular Disease)',
       'Childhood Blood Lead Level Screening', 'Childhood Lead Poisoning',
       'Gonorrhea in Females', 'Tuberculosis', 'Below Poverty Level',
       'Crowded Housing', 'Dependency', 'No High School Diploma',
       'Per Capita Income', 'Unemployment']].reset_index(drop=True)

# 77 rows × 30 columns

In [7]:
# Selecting variables that correlate highly (r >= 0.3) with homicide
#Categorizing variables: crime factors, disease factors, birth factors, and economic factors

"""
Variables are selected for further analysis if over a 0.3 pearson correlation with any crime factor
(i.e., 'Homicide', 'Violence', or 'Property Crimes') and statistically significant as indicated by
a two-tailed t-test (p < 0.5).
"""

# Crime Factors
ys = ['Homicide', 'Violence', 'Property Crimes']

# Disease Factors
print('Disease Factors')
xsd = ['Breast cancer in females',
       'Cancer (All Sites)', 'Colorectal Cancer', 'Diabetes-related', 'Lung Cancer',\
       'Prostate Cancer in Males', 'Stroke (Cerebrovascular Disease)', 'Tuberculosis']
disease_factors = []
for y in ys:
    for x in xsd:
        coefficient, p_value = pearsonr(z[x], z[y])
        if coefficient >= 0.3 and p_value <= .05:
            print(f'x={y}, y={x}, r={coefficient}, p={p_value}')
            if x not in disease_factors:
                disease_factors.append(x)
print()
print(disease_factors)
print()

#Birth Factors
print('Birth Factors')
xsb = ['Birth Rate','General Fertility Rate', 'Low Birth Weight', 'Prenatal Care Beginning in First Trimester', 'Preterm Births','Teen Birth Rate']
birth_problems = []
for y in ys:
    for x in xsb:
        coefficient, p_value = pearsonr(z[x], z[y])
        if coefficient >= 0.3 and p_value <= .05:
            print(f'x={y}, y={x}, r={coefficient}, p={p_value}')
            if x not in birth_problems:
                birth_problems.append(x)
print()
print(birth_problems)
print()

#Economic Factors
print('Economic Factors')
xse = ['Below Poverty Level', 'Crowded Housing', 'Dependency', 'No High School Diploma', 'Per Capita Income', 'Unemployment']
economic_factors = []
for y in ys:
    for x in xse:
        coefficient, p_value = pearsonr(z[x], z[y])
        if coefficient >= 0.3 and p_value <= .05:
            print(f'x={y}, y={x}, r={coefficient}, p={p_value}')
            if x not in economic_factors:
                economic_factors.append(x)
print()
print(economic_factors)
print()

Disease Factors
x=Homicide, y=Breast cancer in females, r=0.404349454435313, p=0.00026513954541516465
x=Homicide, y=Cancer (All Sites), r=0.5342987330417479, p=5.593499116873891e-07
x=Homicide, y=Colorectal Cancer, r=0.3997293860331488, p=0.00031623644381903584
x=Homicide, y=Diabetes-related, r=0.5479774592764278, p=2.4922022604025977e-07
x=Homicide, y=Lung Cancer, r=0.370443643411419, p=0.000912222378386188
x=Homicide, y=Prostate Cancer in Males, r=0.5234656183753, p=1.0350732704160641e-06
x=Homicide, y=Stroke (Cerebrovascular Disease), r=0.41801644280939265, p=0.00015504848981248896
x=Violence, y=Breast cancer in females, r=0.3585250208956884, p=0.0013660005483273113
x=Violence, y=Cancer (All Sites), r=0.44730476075227643, p=4.532536917172316e-05
x=Violence, y=Colorectal Cancer, r=0.32814190285202566, p=0.0035743565377889487
x=Violence, y=Diabetes-related, r=0.47160952400069905, p=1.4945367027873454e-05
x=Violence, y=Lung Cancer, r=0.31138238679510577, p=0.005841172483964133
x=Violen

In [8]:
z.loc[:,'Homicide': 'Property Crimes', ].corr().style.background_gradient(cmap='coolwarm')

Unnamed: 0,Homicide,Homicide_rate_per_100k,Violence,Property Crimes
Homicide,1.0,0.633506,0.948805,0.63977
Homicide_rate_per_100k,0.633506,1.0,0.512642,0.16388
Violence,0.948805,0.512642,1.0,0.800967
Property Crimes,0.63977,0.16388,0.800967,1.0


In [10]:
#Creating strong correlates csv file

strong_correlates = z[['Community Area Name', 'Homicide_rate_per_100k'] + ys + disease_factors + birth_problems + economic_factors]
strong_correlates.to_csv('output_data/analysis_data.csv', index=None)

# 77 rows × 19 columns

strong_correlates.describe()

Unnamed: 0,Homicide_rate_per_100k,Homicide,Violence,Property Crimes,Breast cancer in females,Cancer (All Sites),Colorectal Cancer,Diabetes-related,Lung Cancer,Prostate Cancer in Males,Stroke (Cerebrovascular Disease),Birth Rate,General Fertility Rate,Low Birth Weight,Preterm Births,Teen Birth Rate,Below Poverty Level,Dependency,Unemployment
count,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0
mean,18.068831,41.922078,9030.376623,13073.883117,25.951948,194.277922,21.584416,71.937662,51.481818,36.805195,46.544156,15.698701,68.394805,10.101299,11.264935,50.064935,20.292208,35.82987,13.303896
std,16.561077,48.766837,8336.806512,10438.963028,9.55759,45.652058,7.695335,21.497999,16.439928,20.595189,14.44857,3.528735,15.257465,3.913926,3.016916,28.097817,11.496988,7.269802,7.031965
min,0.0,0.0,623.0,898.0,7.6,120.1,8.6,26.8,15.9,0.0,22.0,9.4,27.7,3.5,5.0,1.3,3.1,15.5,4.2
25%,4.9,9.0,3071.0,4869.0,20.2,152.9,15.1,58.6,37.7,19.7,37.9,12.9,60.1,7.3,8.8,33.7,12.0,32.3,7.8
50%,10.8,21.0,6165.0,9661.0,24.0,189.4,21.9,73.0,50.0,32.1,43.1,15.7,68.3,8.7,10.8,49.2,18.2,38.3,11.5
75%,32.2,59.0,13119.0,17936.0,32.7,235.2,27.1,83.9,63.8,51.4,53.5,18.5,80.7,12.7,13.7,67.9,26.1,40.9,17.4
max,70.3,238.0,46833.0,46262.0,54.7,291.5,39.4,119.1,89.6,92.9,99.1,22.4,94.9,19.7,17.5,116.9,61.4,50.2,40.0


In [11]:
strong_correlates.columns

Index(['Community Area Name', 'Homicide_rate_per_100k', 'Homicide', 'Violence',
       'Property Crimes', 'Breast cancer in females', 'Cancer (All Sites)',
       'Colorectal Cancer', 'Diabetes-related', 'Lung Cancer',
       'Prostate Cancer in Males', 'Stroke (Cerebrovascular Disease)',
       'Birth Rate', 'General Fertility Rate', 'Low Birth Weight',
       'Preterm Births', 'Teen Birth Rate', 'Below Poverty Level',
       'Dependency', 'Unemployment'],
      dtype='object')

In [12]:
health_factors = strong_correlates[['Homicide_rate_per_100k', 'Breast cancer in females', 'Cancer (All Sites)',
       'Colorectal Cancer', 'Diabetes-related', 'Lung Cancer', 'Prostate Cancer in Males', 'Stroke (Cerebrovascular Disease)']]

health_factors.columns = ['Homicide (per 100k)', 'Breast Cancer', 'Cancer (All Sites)', 'Colorectal Cancer',\
                                 'Diabetes-related', 'Lung Cancer', 'Prostate Cancer', 'Stroke']

health_factors.corr().style.background_gradient()

Unnamed: 0,Homicide (per 100k),Breast Cancer,Cancer (All Sites),Colorectal Cancer,Diabetes-related,Lung Cancer,Prostate Cancer,Stroke
Homicide (per 100k),1.0,0.342735,0.749447,0.650992,0.692539,0.706495,0.738349,0.780591
Breast Cancer,0.342735,1.0,0.557138,0.455001,0.326569,0.328028,0.430277,0.303131
Cancer (All Sites),0.749447,0.557138,1.0,0.804605,0.730662,0.90009,0.825934,0.692949
Colorectal Cancer,0.650992,0.455001,0.804605,1.0,0.562055,0.722877,0.616068,0.592772
Diabetes-related,0.692539,0.326569,0.730662,0.562055,1.0,0.633578,0.670423,0.672931
Lung Cancer,0.706495,0.328028,0.90009,0.722877,0.633578,1.0,0.741645,0.719298
Prostate Cancer,0.738349,0.430277,0.825934,0.616068,0.670423,0.741645,1.0,0.641115
Stroke,0.780591,0.303131,0.692949,0.592772,0.672931,0.719298,0.641115,1.0


In [13]:
birth_factors = strong_correlates[['Homicide_rate_per_100k',
       'Birth Rate', 'General Fertility Rate', 'Low Birth Weight',
       'Preterm Births', 'Teen Birth Rate']].reset_index(drop=True)


birth_factors.rename(columns={'Homicide_rate_per_100k': 'Homicide (per 100k)'}, inplace=True)

birth_factors.corr()

Unnamed: 0,Homicide (per 100k),Birth Rate,General Fertility Rate,Low Birth Weight,Preterm Births,Teen Birth Rate
Homicide (per 100k),1.0,0.188536,0.293843,0.747493,0.742019,0.768307
Birth Rate,0.188536,1.0,0.810334,0.108179,0.004334,0.61271
General Fertility Rate,0.293843,0.810334,1.0,0.142189,0.122235,0.655528
Low Birth Weight,0.747493,0.108179,0.142189,1.0,0.8431,0.622936
Preterm Births,0.742019,0.004334,0.122235,0.8431,1.0,0.549843
Teen Birth Rate,0.768307,0.61271,0.655528,0.622936,0.549843,1.0


In [14]:
economic_factors = strong_correlates[['Homicide_rate_per_100k','Below Poverty Level','Dependency', 'Unemployment']].reset_index(drop=True)
economic_factors.rename(columns={'Homicide_rate_per_100k': 'Homicide (per 100k)'}, inplace=True)
economic_factors.corr().corr().style.background_gradient('BrBG')

Unnamed: 0,Homicide (per 100k),Below Poverty Level,Dependency,Unemployment
Homicide (per 100k),1.0,0.166107,-0.441106,0.596215
Below Poverty Level,0.166107,1.0,-0.954319,0.494276
Dependency,-0.441106,-0.954319,1.0,-0.562269
Unemployment,0.596215,0.494276,-0.562269,1.0


In [15]:
# Function for Simple Regression

# Features: plots line, provides stats, describes relationship

def regression(x, y, x_label, y_label):
    title = f'{x_label} predicting {y_label}'
    (slope, intercept, rvalue, pvalue, stderr) = linregress(x, y)
    regress_values = x * slope + intercept
    line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
    plt.scatter(x, y)
    plt.plot(x,regress_values,"r-")
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)
    title = title.replace('\n', '_').replace(' ', '_')
    plt.savefig(f'output_data/{title}.png')
    plt.show()
    plt.close()
    print(line_eq)
    print()
    r = pearsonr(x, y)[0]
    p = pearsonr(x, y)[1]
    print(f"r = {r:.2f}, p = {p:.4f}, r_squared = {rvalue**2:.2f}")
    print()
    if p > 0.05:
        print(f'There is no relationship between {x_label.lower()} and {y_label.lower()}, p > 0.05.')
    else:
        if r > 0:
            explanation = f'An increase in {x_label.lower()} is associated with an increase in {y_label.lower()}.'
            direction = 'positive'
            if r >= 0.6:
                strength = 'strong'
            elif r >= 0.3:
                strength = 'moderate'
            else:
                strength = 'weak'
        else:
            direction = 'negative'
            explanation = f'An increase in {x_label.lower()} is associated with a decrease in {y_label.lower()}.'
            if r <= -0.6:
                strength = 'strong'
            elif r <= -0.3:
                strength = 'moderate'
            else:
                strength = 'weak'       
        print(f'There is a {strength} {direction} relationship between {x_label.lower()} and {y_label.lower()}, p > 0.05.')
        print(explanation)
    print()

In [16]:
# Multiple Regression Function

# Features: Provides stats and statistical model test

def multiple_regression(xs, y): # xs is a dataframe, y is a series; prints output and returns predictions
    xs = sm.add_constant(xs) # adding a constant
    model = sm.OLS(y, xs).fit()
    predictions = model.predict(xs) 
    print_model = model.summary()
    print(print_model)
    return predictions

In [17]:
#Multiple Regression: Regressing Neighborhood Homicide on Disease Factors

xs = strong_correlates[['Cancer (All Sites)','Colorectal Cancer', 'Diabetes-related', 'Lung Cancer','Prostate Cancer in Males', 'Stroke (Cerebrovascular Disease)']]
y = strong_correlates['Homicide']

multiple_regression(xs, y)

# Analysis finds significant results, but has potential multicollinearity issues

# Prostrate Cancer and Stroke were the only signficant regression coefficients.

# N = 77

#Sig factors were lung cancer & Cancer (All Sites)

                            OLS Regression Results                            
Dep. Variable:               Homicide   R-squared:                       0.411
Model:                            OLS   Adj. R-squared:                  0.361
Method:                 Least Squares   F-statistic:                     8.146
Date:                Mon, 27 Jul 2020   Prob (F-statistic):           1.09e-06
Time:                        19:57:24   Log-Likelihood:                -387.67
No. Observations:                  77   AIC:                             789.3
Df Residuals:                      70   BIC:                             805.7
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                                       coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
const   

0     29.227726
1     17.301870
2    -15.109483
3     56.478757
4     69.826547
        ...    
72    18.179161
73    66.694147
74    21.469027
75    51.783621
76    64.068735
Length: 77, dtype: float64

In [18]:
#Multiple Regression: Regressing Neighborhood Homicide Rate (per 100k) on Disease Factors

xs = strong_correlates[['Cancer (All Sites)','Colorectal Cancer', 'Diabetes-related', 'Lung Cancer','Prostate Cancer in Males', 'Stroke (Cerebrovascular Disease)']]
y = strong_correlates['Homicide_rate_per_100k']

multiple_regression(xs, y)

# Analysis finds significant results, but has potential multicollinearity issues

# Prostrate Cancer and Stroke were the only signficant regression coefficients.

# N = 77

#Sig factors: Prostate Cancer in Males & Stroke (Cerebrovascular Disease)

                              OLS Regression Results                              
Dep. Variable:     Homicide_rate_per_100k   R-squared:                       0.729
Model:                                OLS   Adj. R-squared:                  0.706
Method:                     Least Squares   F-statistic:                     31.41
Date:                    Mon, 27 Jul 2020   Prob (F-statistic):           5.05e-18
Time:                            19:57:29   Log-Likelihood:                -274.61
No. Observations:                      77   AIC:                             563.2
Df Residuals:                          70   BIC:                             579.6
Df Model:                               6                                         
Covariance Type:                nonrobust                                         
                                       coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------

0      7.514723
1      7.488305
2      6.923713
3     21.646169
4     30.272844
        ...    
72     9.351363
73    37.176332
74     4.647166
75     7.542766
76    31.285998
Length: 77, dtype: float64

In [19]:
#Multiple Regression: Regressing Neighboord Homicide on Birth Factors

# % of low birth rate, % of preterm births, teen birth rate per 1,000
xs = strong_correlates[['Low Birth Weight', 'Preterm Births', 'Teen Birth Rate']]
y = strong_correlates['Homicide']

multiple_regression(xs, y)

# 

# N = 77

# Sig Factor: Teen Birth Rate

                            OLS Regression Results                            
Dep. Variable:               Homicide   R-squared:                       0.540
Model:                            OLS   Adj. R-squared:                  0.521
Method:                 Least Squares   F-statistic:                     28.60
Date:                Mon, 27 Jul 2020   Prob (F-statistic):           2.45e-12
Time:                        19:57:33   Log-Likelihood:                -378.13
No. Observations:                  77   AIC:                             764.3
Df Residuals:                      73   BIC:                             773.6
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const              -29.3841     15.501  

0     33.655915
1     39.968258
2      8.163716
3     26.679518
4     80.843182
        ...    
72    32.079634
73    69.749244
74    16.567696
75    39.227287
76    55.132460
Length: 77, dtype: float64

In [20]:
#Multiple Regression: Regressing Neighboord Homicide Rate (per 100k) on Birth Factors

# % of low birth rate, % of preterm births, teen birth rate per 1,000
xs = strong_correlates[['Low Birth Weight', 'Preterm Births', 'Teen Birth Rate']]
y = strong_correlates['Homicide_rate_per_100k']

multiple_regression(xs, y)

# 

# N = 77

# Sig factors: Teen Birth Rate & Preterm Births

                              OLS Regression Results                              
Dep. Variable:     Homicide_rate_per_100k   R-squared:                       0.743
Model:                                OLS   Adj. R-squared:                  0.732
Method:                     Least Squares   F-statistic:                     70.30
Date:                    Mon, 27 Jul 2020   Prob (F-statistic):           1.76e-21
Time:                            19:57:36   Log-Likelihood:                -272.61
No. Observations:                      77   AIC:                             553.2
Df Residuals:                          73   BIC:                             562.6
Df Model:                               3                                         
Covariance Type:                nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------


0      9.804957
1     14.806473
2     11.013289
3     14.072313
4     33.421392
        ...    
72     9.236795
73    32.222612
74     5.403495
75    16.277939
76    32.753481
Length: 77, dtype: float64

In [21]:
#Multiple Regression: Regressing Homicide on Economic Factors

xs = strong_correlates[['Below Poverty Level', 'Dependency', 'Unemployment']]
y = strong_correlates['Homicide']

multiple_regression(xs, y)

# Sig Factor: Below Poverty Level

                            OLS Regression Results                            
Dep. Variable:               Homicide   R-squared:                       0.275
Model:                            OLS   Adj. R-squared:                  0.245
Method:                 Least Squares   F-statistic:                     9.225
Date:                Mon, 27 Jul 2020   Prob (F-statistic):           3.00e-05
Time:                        19:57:38   Log-Likelihood:                -395.68
No. Observations:                  77   AIC:                             799.4
Df Residuals:                      73   BIC:                             808.7
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                 -21.1735    

0     30.232872
1     35.061478
2     62.795982
3     21.552159
4     64.441589
        ...    
72    31.138163
73    55.344080
74    29.168055
75    19.932717
76    59.056878
Length: 77, dtype: float64

In [22]:
#Multiple Regression: Regressing Homicide_rate_per_100k on Economic Factors

xs = strong_correlates[['Below Poverty Level', 'Dependency', 'Unemployment']]
y = strong_correlates['Homicide_rate_per_100k']

multiple_regression(xs, y)

# Sig factor: Unemployment

                              OLS Regression Results                              
Dep. Variable:     Homicide_rate_per_100k   R-squared:                       0.681
Model:                                OLS   Adj. R-squared:                  0.668
Method:                     Least Squares   F-statistic:                     52.01
Date:                    Mon, 27 Jul 2020   Prob (F-statistic):           4.29e-18
Time:                            19:57:42   Log-Likelihood:                -280.88
No. Observations:                      77   AIC:                             569.8
Df Residuals:                          73   BIC:                             579.1
Df Model:                               3                                         
Covariance Type:                nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------

0      9.877841
1     19.560501
2     19.062191
3      9.660002
4     37.069365
        ...    
72    10.917498
73    26.356720
74     9.869848
75     2.194825
76    26.059198
Length: 77, dtype: float64

In [23]:
# 3D Visualization of Multiple Regression Function
def regression_3d_visualization(dataframe, x1, x2, y, size = (10,10), x1label='X Label', x2label='Y Label', ylabel='Z Label'):
    df = dataframe[[x1, x2, y]].reset_index(drop=True)
    x1r, x2r, yr = x1, x2, y
    if ' ' in x1:
        x1r = x1.replace(' ', '')
        df.rename(columns={x1: x1r}, inplace=True)
    if ' ' in x2:
        x2r = x2.replace(' ', '')
        df.rename(columns={x2: x2r}, inplace=True)
    if ' ' in y:
        yr = y.replace(' ', '')
        df.rename(columns={y: yr}, inplace=True)
    model = smf.ols(formula=f'{yr} ~ {x1r} + {x2r}', data=df)
    results = model.fit()
    results.params

    x_dim, y_dim = np.meshgrid(np.linspace(df[x1r].min(), df[x1r].max(), 100), np.linspace(df[x2r].min(), df[x2r].max(), 100))
    xs = pd.DataFrame({x1r: x_dim.ravel(), x2r: y_dim.ravel()})
    predicted_y = results.predict(exog=xs)
    predicted_y=np.array(predicted_y)

    fig = plt.figure(figsize=size, facecolor='b')
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(df[x1r], df[x2r], df[yr], c='red', marker='o', alpha=0.5)
    ax.plot_surface(x_dim, y_dim, predicted_y.reshape(x_dim.shape), color='b', alpha=0.3)
    ax.set_xlabel(x1label)
    ax.set_ylabel(x2label)
    ax.set_zlabel(ylabel)
    plt.show()

In [24]:
#variables to study
#teen birth rate
#below poverty level
#unemployment

# 3D Visualization: Regressing Neighborhood on Teen Birth Rate and Unemployment
regression_3d_visualization(dataframe=strong_correlates, x1='Unemployment', x2='Teen Birth Rate', y='Homicide',\
                            size = (10,10), x1label='Unemployment', x2label='Teen Birth Rate (per 1k)',\
                            ylabel='Neighborhood Homicide')

In [25]:
#Multiple Regression: Regressing Homicide on Teen Birth & Economic Factors

xs = strong_correlates[['Teen Birth Rate', 'Unemployment']]
y = strong_correlates['Homicide']

multiple_regression(xs, y)


                            OLS Regression Results                            
Dep. Variable:               Homicide   R-squared:                       0.533
Model:                            OLS   Adj. R-squared:                  0.520
Method:                 Least Squares   F-statistic:                     42.17
Date:                Mon, 27 Jul 2020   Prob (F-statistic):           5.98e-13
Time:                        19:57:48   Log-Likelihood:                -378.77
No. Observations:                  77   AIC:                             763.5
Df Residuals:                      74   BIC:                             770.6
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const             -18.7532      8.620     

0     36.817293
1     41.723164
2     -3.353548
3     28.463252
4     80.729353
        ...    
72    37.650318
73    64.005773
74    17.509928
75    44.979209
76    41.017021
Length: 77, dtype: float64

In [26]:
#Predicted and Actual Homicides based on Teen Birth Rate and Unemployment

predicted_homicides = []

for index, row in strong_correlates.iterrows():
    predicted_value = -18.7532 + 1.3661*row['Teen Birth Rate'] + -0.5803*row['Unemployment']
    predicted_homicides.append(predicted_value)
    

actual_homicides = strong_correlates.Homicide.to_list()
d = pd.read_csv('data/Neighborhood_Health.csv')

In [27]:
# def actual_vs_predicted(dataframe, index_name, actual_values, predicted_values, chart_size = (8, 6)): #use lists for actual and predicted
#     indices = dataframe.set_index(index_name)
#     x_axis = np.arange(len(actual_values))
#     horizontal_categories = dataframe.index.tolist()
#     tick_locations = [x for x in x_axis]
#     plt.figure(figsize=chart_size)
#     plt.bar(x_axis, dataframe, alpha=0.75, align='center', width=0.55)
#     plt.xticks(tick_locations, horizontal_categories, rotation='vertical')
#     plt.xlim(-0.75,len(x_axis))
#     plt.ylim(0, max(dataframe) + 10)
#     plt.title('Datapoints for each Treatment')
#     plt.xlabel('Drug Regimen')
#     plt.ylabel('Number of Data Points')
#     plt.savefig('../Images/Dps_by_treatment_plt')
#     plt.show()
    
# actual_vs_predicted(strong_correlates, 'Community Area Name', actual_homicides, predicted_homicides)

In [28]:
#variables to study
#teen birth rate
#below poverty level
#unemployment

# 3D Visualization: Regressing Neighborhood Homicide Rate (per 100k)on Teen Birth Rate and Unemployment
regression_3d_visualization(dataframe=strong_correlates, x1='Unemployment', x2='Teen Birth Rate', y='Homicide_rate_per_100k',\
                            size = (10,10), x1label='Unemployment', x2label='Teen Birth Rate (per 1k)',\
                            ylabel='Neighborhood Homicide_rate_per_100k')

In [29]:
#Multiple Regression: Regressing Homicide_rate_per_100k on Teen Birth & Economic Factors
xs = strong_correlates[['Teen Birth Rate', 'Unemployment']]
y = strong_correlates['Homicide_rate_per_100k']

multiple_regression(xs, y)


                              OLS Regression Results                              
Dep. Variable:     Homicide_rate_per_100k   R-squared:                       0.735
Model:                                OLS   Adj. R-squared:                  0.728
Method:                     Least Squares   F-statistic:                     102.8
Date:                    Mon, 27 Jul 2020   Prob (F-statistic):           4.37e-22
Time:                            19:57:57   Log-Likelihood:                -273.72
No. Observations:                      77   AIC:                             553.4
Df Residuals:                          74   BIC:                             560.5
Df Model:                               2                                         
Covariance Type:                nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
co

0     11.313539
1     19.267911
2      8.283757
3      9.665290
4     39.426124
        ...    
72     9.800967
73    26.779837
74     6.627526
75     8.531502
76    23.413070
Length: 77, dtype: float64

In [30]:
#Example prediction equation
t, e = 50.064935 + (2*28.097817), 13.303896 + (2*7.031965)
Neighborhood_Homicide_Rate_Per_100k = -10.1931 + 0.2246*t + 1.2792*e
print(Neighborhood_Homicide_Rate_Per_100k)
#18.068831 + (2*16.561077) = 51.190985
#bar chart (each community will have two bar: one bar is predicted value and other bar is actual)

48.6819468166


In [31]:
#Predicted and Actual Homicides based on Teen Birth Rate and Unemployment

predicted_homicide_rate = []

for index, row in strong_correlates.iterrows():
    predicted_value = -10.1931 + 0.2246*row['Teen Birth Rate'] + 1.2792*row['Unemployment']
    predicted_homicide_rate.append(predicted_value)
    

actual_homicide_rate = strong_correlates['Homicide_rate_per_100k'].to_list()
d = pd.read_csv('data/Neighborhood_Health.csv')

In [33]:
#predicted_homicide_rate
#actual_homicide_rate 
N = len(predicted_homicide_rate) #77
ind = np.arange(N) 
width = 0.35
plt.bar(ind, predicted_homicide_rate, width, label='Predicted (per 100k)')
plt.bar(ind + width, actual_homicide_rate, width, label='Observed (per 100k)')

plt.ylabel('Homidicide Rates')
plt.xlabel('Neighborhoods')
plt.title('Predicted Vs. Observed Homicide Rate (per 100k) by Chicago Neighborhood')

neighborhoods = ()
for i in range(77):
    neighborhoods += ("",)

plt.xticks(ind + width / 2, neighborhoods)
plt.legend(loc='best')
plt.show()

In [34]:
homicide_rate_per_100k = strong_correlates['Homicide_rate_per_100k']

def outlier_analysis(dfseries):
    q = dfseries.quantile([.25, .5, .75])
    print('\nQuartiles:')
    print(q)

    print('\nInterquartile Range:')
    iqr = q[0.75] - q[0.25]
    print(iqr)

    print('\nOutlier Boundaries:')
    lb = q[0.25] - (1.5*iqr)
    ub = q[0.75] + (1.5*iqr)
    print('Lower = ' + str(lb) + ';', 'Upper = ' + str(ub))

    outlier_values = []
    for rate in dfseries:
        if rate <= lb or rate >= ub:
            outlier_values.append(rate)

    print('\nOutlier Values:')
    if len(outlier_values) == 0:
        print('There are no outlier values.')
    elif len(outlier_values) == 1:
        print(f'There is {len(outlier_values)} outlier value.\nThe outlier value is:')
        print(outlier_values)       
    else:
        print(f'There are {len(outlier_values)} outlier values.\nThe outlier values are:')
        print(outlier_values)
    print()
    print()
    print()
    
outlier_analysis(homicide_rate_per_100k)
outlier_analysis(strong_correlates['Homicide'])
strong_correlates['Homicide'].describe()


Quartiles:
0.25     4.9
0.50    10.8
0.75    32.2
Name: Homicide_rate_per_100k, dtype: float64

Interquartile Range:
27.300000000007596

Outlier Boundaries:
Lower = -36.05000000001232; Upper = 73.15000000001807

Outlier Values:
There are no outlier values.




Quartiles:
0.25     9.0
0.50    21.0
0.75    59.0
Name: Homicide, dtype: float64

Interquartile Range:
50.0

Outlier Boundaries:
Lower = -66.0; Upper = 134.0

Outlier Values:
There are 5 outlier values.
The outlier values are:
[238, 150, 155, 135, 155]





count     77.000000
mean      41.922078
std       48.766837
min        0.000000
25%        9.000000
50%       21.000000
75%       59.000000
max      238.000000
Name: Homicide, dtype: float64

In [35]:
#Removing outliers

outliers_removed = strong_correlates[strong_correlates.Homicide < 134]
outliers_removed

Unnamed: 0,Community Area Name,Homicide_rate_per_100k,Homicide,Violence,Property Crimes,Breast cancer in females,Cancer (All Sites),Colorectal Cancer,Diabetes-related,Lung Cancer,Prostate Cancer in Males,Stroke (Cerebrovascular Disease),Birth Rate,General Fertility Rate,Low Birth Weight,Preterm Births,Teen Birth Rate,Below Poverty Level,Dependency,Unemployment
0,Albany Park,4.7,21,6116,9053,22.9,158.1,16.8,72.1,36.9,13.1,39.1,18.3,76.5,8.5,8.3,44.5,17.1,32.1,9.0
1,Archer Heights,16.6,9,1856,4547,25.2,166.3,9.0,67.7,49.6,20.5,41.8,18.1,80.0,8.7,10.0,50.3,13.0,40.5,14.2
2,Armour Square,1.8,5,1886,4155,10.7,162.9,23.1,42.5,54.3,17.2,38.7,11.5,57.1,12.4,11.8,16.2,35.8,37.9,11.6
3,Ashburn,12.4,20,6165,10392,37.2,229.3,22.8,80.1,62.8,44.5,47.4,14.7,69.0,9.0,11.3,38.3,9.5,36.7,8.8
4,Auburn Gresham,37.6,117,23178,26566,41.9,243.0,24.5,83.6,65.1,43.5,63.7,15.1,70.5,11.6,13.9,83.1,24.5,42.1,24.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,West Lawn,10.7,15,3800,10176,16.9,145.1,15.7,61.5,36.7,19.7,43.1,18.8,83.3,7.6,8.3,44.6,15.3,41.9,7.8
73,West Pullman,43.9,91,14241,14565,20.3,263.6,32.6,83.4,78.6,62.9,63.9,15.3,71.2,14.9,14.4,67.8,24.3,42.2,17.0
74,West Ridge,5.8,15,7534,14923,20.2,155.9,17.3,60.5,36.0,14.2,34.7,17.3,83.3,8.1,8.3,29.9,15.1,38.3,7.9
75,West Town,8.5,47,15444,40496,14.5,139.6,12.4,107.0,27.4,16.6,33.3,18.8,60.4,9.1,10.8,49.2,15.7,22.9,6.0


In [36]:
#Multiple Regression: Regressing Homicide on Teen Birth & Economic Factors

xs = outliers_removed[['Teen Birth Rate', 'Unemployment']]
y = outliers_removed['Homicide']

multiple_regression(xs, y)


                            OLS Regression Results                            
Dep. Variable:               Homicide   R-squared:                       0.525
Model:                            OLS   Adj. R-squared:                  0.512
Method:                 Least Squares   F-statistic:                     38.20
Date:                Mon, 27 Jul 2020   Prob (F-statistic):           6.80e-12
Time:                        19:58:24   Log-Likelihood:                -332.43
No. Observations:                  72   AIC:                             670.9
Df Residuals:                      69   BIC:                             677.7
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const             -10.0540      6.707     

0     32.982960
1     35.928583
2      0.107621
3     26.289658
4     65.423543
        ...    
72    33.886806
73    53.342275
74    17.637396
75    40.141590
76    34.758768
Length: 72, dtype: float64

In [39]:
"""
Limitations:

Correlational and a cross_sectional non-experimental study
Drawing inferences about individuals based on aggregated data (e.g., aggregated at neighborhood and state level)
Well-being measure: 
  need more measures of health factors
  need psychological measures
Need more measures economic measures


Future Directions:

Hierarchical Regression Analysis 
Exploratory Factor Analysis
Individual-level data
Different crime studies (data available)


"""

'\nLimitations:\n\nCorrelational and a cross_sectional non-experimental study\nDrawing inferences about individuals based on aggregated data (e.g., aggregated at neighborhood and state level)\nWell-being measure: \n  need more measures of health factors\n  need psychological measures\nNeed more measures economic measures\n\n\nFuture Directions:\n\nHierarchical Regression Analysis \nExploratory Factor Analysis\nIndividual-level data\nDifferent crime studies (data available)\n\n\n'