In [1]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
import xgboost as xgb


# Data Analysis Begins

- Make a correlation matrix

In [40]:
# Read the crime dataset from the Excel workbook, then preview its first five rows.
# NOTE(review): the rendered preview shows high_school_rate = 41.8 for Alameda 2013
# while the adjacent years are ~86 -- possibly a data-entry anomaly; confirm before modelling.
crime_dataframe = pd.read_excel(io='../data/tentative_final.xlsx')
crime_dataframe.head(n=5)


Unnamed: 0,County,Year,Population,crime_rate,clearance_rate,population_density,unemployment_rate,adjusted_income,poverty_rate_%,rent_burden,...,adj_health_budget,adj_judiciary_budget,adj_prison_budget,median_house_value,house_affordability,Category_Rural,Category_Suburban,Category_Urban,uninsured_rate,high_school_rate
0,Alameda,2010,1510271,0.006931,0.002266,2046.437669,11.2,294.98191,13.5,42.4,...,1.607682,0.516154,0.664076,590900,8.827704,0,0,1,0.132639,85.9
1,Alameda,2011,1527169,0.007062,0.002078,2069.334688,10.3,288.906538,13.2,42.3,...,1.603506,0.501708,0.649141,558300,8.296307,0,0,1,0.128512,86.0
2,Alameda,2012,1549193,0.007681,0.001902,2099.177507,8.8,294.803804,13.1,42.6,...,1.651499,0.498915,0.611663,514900,7.333818,0,0,1,0.123669,86.2
3,Alameda,2013,1575139,0.007453,0.001877,2134.334688,7.3,298.514628,13.0,42.6,...,1.663709,0.486929,0.600055,493800,6.846162,0,0,1,0.125603,41.8
4,Alameda,2014,1597747,0.006482,0.001927,2164.968835,5.9,312.921908,12.5,43.0,...,1.639288,0.495276,0.631766,509300,6.614629,0,0,1,0.117792,86.7


In [41]:
# Move County and Year into a two-level row index. set_index returns a new
# frame here, so crime_dataframe itself is left as loaded.
df = crime_dataframe.set_index(keys=['County', 'Year'])

## Features to analyse

In [44]:
# Column names selected for analysis, one per line so diffs stay readable.
# NOTE(review): several names below ("Vacant_rate", "percent_in_poverty",
# "adjusted_median_income", "no_highschool_rate") do not appear among the column
# headers visible in the rendered frame, which shows "poverty_rate_%",
# "adjusted_income" and "high_school_rate" instead. The display is truncated,
# so verify these names against df.columns before indexing with this list.
useful_features = [
    "Population",
    "crime_rate",
    "clearance_rate",
    "population_density",
    "Vacant_rate",
    "number_of_person_in_household",
    "mobile_home_ratio",
    "percent_in_poverty",
    "adjusted_median_income",
    "unemployment_rate",
    "dropout_rate",
    "public_school_rate",
    "no_highschool_rate",
    "uninsured_rate",
    "house_affordability",
    "adj_police_budget",
    "adj_education_budget",
    "adj_welfare_budget",
    "adj_mental_health_budget",
    "adj_rehab_budget",
    "adj_health_budget",
    "adj_judiciary_budget",
    "adj_prison_budget",
    "median_age",
    "home_ownership_rate",
    "rent_burden",
]

In [42]:
# Display the (County, Year)-indexed frame; the rendered output abbreviates
# the middle rows and columns with "...".
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Population,crime_rate,clearance_rate,population_density,unemployment_rate,adjusted_income,poverty_rate_%,rent_burden,home_ownership_rate,mobile_home_ratio,...,adj_health_budget,adj_judiciary_budget,adj_prison_budget,median_house_value,house_affordability,Category_Rural,Category_Suburban,Category_Urban,uninsured_rate,high_school_rate
County,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Alameda,2010,1510271,0.006931,0.002266,2046.437669,11.2,294.981910,13.5,42.4,0.551246,0.012387,...,1.607682,0.516154,0.664076,590900,8.827704,0,0,1,0.132639,85.9
Alameda,2011,1527169,0.007062,0.002078,2069.334688,10.3,288.906538,13.2,42.3,0.545447,0.012529,...,1.603506,0.501708,0.649141,558300,8.296307,0,0,1,0.128512,86.0
Alameda,2012,1549193,0.007681,0.001902,2099.177507,8.8,294.803804,13.1,42.6,0.537406,0.012231,...,1.651499,0.498915,0.611663,514900,7.333818,0,0,1,0.123669,86.2
Alameda,2013,1575139,0.007453,0.001877,2134.334688,7.3,298.514628,13.0,42.6,0.531967,0.012731,...,1.663709,0.486929,0.600055,493800,6.846162,0,0,1,0.125603,41.8
Alameda,2014,1597747,0.006482,0.001927,2164.968835,5.9,312.921908,12.5,43.0,0.528597,0.012210,...,1.639288,0.495276,0.631766,509300,6.614629,0,0,1,0.117792,86.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Yuba,2012,73501,0.003877,0.001823,116.668254,15.6,174.671957,22.4,45.2,0.598765,0.102783,...,0.390209,0.443185,1.113337,180700,4.343854,1,0,0,0.153530,78.4
Yuba,2013,74114,0.003670,0.001822,117.641270,13.3,173.096104,20.6,48.6,0.590782,0.100490,...,0.365624,0.435113,1.052415,171000,4.088562,1,0,0,0.166095,13.7
Yuba,2014,74755,0.003960,0.001980,118.658730,11.3,168.267257,21.6,49.1,0.581094,0.105134,...,0.350782,0.439094,1.031046,170800,4.125305,1,0,0,0.161760,79.6
Yuba,2015,75374,0.004153,0.001818,119.641270,9.3,186.248828,21.6,48.9,0.580492,0.098604,...,0.326020,0.420023,1.068289,172100,3.701075,1,0,0,0.142650,80.8


In [43]:
# Pairwise Pearson correlation between every pair of columns in df
# (Pearson is the pandas default, spelled out here for the reader).
correlation_matrix = df.corr(method='pearson')
correlation_matrix

Unnamed: 0,Population,crime_rate,clearance_rate,population_density,unemployment_rate,adjusted_income,poverty_rate_%,rent_burden,home_ownership_rate,mobile_home_ratio,...,adj_health_budget,adj_judiciary_budget,adj_prison_budget,median_house_value,house_affordability,Category_Rural,Category_Suburban,Category_Urban,uninsured_rate,high_school_rate
Population,1.0,0.031237,-0.083329,0.683003,-0.086973,0.215441,-0.062095,0.163996,-0.409649,-0.318643,...,-0.107896,-0.066977,-0.253234,0.253774,0.287276,-0.381532,-0.159638,0.625795,0.110144,-0.026869
crime_rate,0.031237,1.0,0.69739,-0.040181,0.038881,-0.297662,0.326736,-0.14318,-0.049374,0.150003,...,0.294611,0.239994,0.16415,-0.228848,-0.188851,0.054411,-0.075173,0.019129,-0.046893,0.001742
clearance_rate,-0.083329,0.69739,1.0,-0.229988,0.066338,-0.402488,0.348018,-0.086861,0.028528,0.355839,...,0.128686,0.181582,0.203546,-0.346056,-0.243294,0.195655,-0.009929,-0.219343,-0.000855,-0.047193
population_density,0.683003,-0.040181,-0.229988,1.0,-0.193067,0.540662,-0.289989,0.072816,-0.356746,-0.454358,...,-0.114658,-0.096118,-0.315254,0.495672,0.395311,-0.452231,-0.160968,0.710496,-0.060155,0.055176
unemployment_rate,-0.086973,0.038881,0.066338,-0.193067,1.0,-0.443592,0.547655,0.075893,-0.120608,0.096465,...,-0.090976,-0.075185,-0.117603,-0.457357,-0.238134,0.020031,0.101003,-0.135345,0.60437,-0.386842
adjusted_income,0.215441,-0.297662,-0.402488,0.540662,-0.443592,1.0,-0.781108,-0.096954,-0.066474,-0.627731,...,-0.096486,-0.08157,-0.22919,0.864905,0.535652,-0.479103,0.12172,0.429323,-0.381327,0.215629
poverty_rate_%,-0.062095,0.326736,0.348018,-0.289989,0.547655,-0.781108,1.0,0.268727,-0.240667,0.371008,...,-0.028127,-0.050151,-0.037615,-0.66605,-0.372751,0.121548,0.038748,-0.185965,0.545855,-0.347421
rent_burden,0.163996,-0.14318,-0.086861,0.072816,0.075893,-0.096954,0.268727,1.0,-0.225977,0.040651,...,-0.478228,-0.426968,-0.504707,-0.028809,0.152244,-0.285566,0.184092,0.132469,0.192017,-0.106431
home_ownership_rate,-0.409649,-0.049374,0.028528,-0.356746,-0.120608,-0.066474,-0.240667,-0.225977,1.0,0.248864,...,0.374972,0.182277,0.205205,-0.127639,-0.192865,0.528296,-0.300613,-0.289279,-0.226939,0.22296
mobile_home_ratio,-0.318643,0.150003,0.355839,-0.454358,0.096465,-0.627731,0.371008,0.040651,0.248864,1.0,...,-0.025111,0.080498,0.455246,-0.492318,-0.309468,0.495001,-0.156747,-0.409278,0.028036,-0.00356
