# Analysis using the data collected in the previous notebook (.ipynb)

# Imports

In [3]:
# Initial imports
import pandas as pd
import numpy as np
import datetime as dt
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import preprocessing

%matplotlib inline

# Reading in CSV Files

### Reading in and correctly indexing the Crime DataFrame

In [4]:
# Load the combined crime + population CSV, keyed by metro-area name.
crime_path = Path("../Project-1/resources/crime_out.csv")
# Parse only the actual `date` column. `parse_dates=True` would target the
# index (city names, which are not dates), and `infer_datetime_format` is
# deprecated in pandas 2.x.
crime_data = pd.read_csv(crime_path, index_col="city_description", parse_dates=["date"])

# Drop the crime breakdown columns this analysis treats as irrelevant.
crime_data = crime_data.drop(columns=["actual_murder", "actual_index_violent"])
crime_data

Unnamed: 0_level_0,date,cbsa_code,population,actual_all_crimes
city_description,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"Austin-Round Rock, TX Metro Area",2014-01-01,12420,1941049,6686.0
"Charlotte-Concord-Gastonia, NC-SC Metro Area",2014-01-01,16740,2373749,8141.0
"Columbia, SC Metro Area",2014-01-01,17900,804684,3172.0
"Dallas-Fort Worth-Arlington, TX Metro Area",2014-01-01,19100,6945276,23204.0
"Las Vegas-Henderson-Paradise, NV Metro Area",2014-01-01,29820,2066423,8875.0
...,...,...,...,...
"Nashville-Davidson--Murfreesboro--Franklin, TN Metro Area",2018-12-01,34980,1942634,7540.0
"Orlando-Kissimmee-Sanford, FL Metro Area",2018-12-01,36740,2568290,0.0
"Phoenix-Mesa-Scottsdale, AZ Metro Area",2018-12-01,38060,4876829,16519.0
"Riverside-San Bernardino-Ontario, CA Metro Area",2018-12-01,40140,4598111,13504.0


In [5]:
# Re-key the crime frame by (month_year, city_description).
# Built as a single reset -> assign -> set_index chain so the cell can be
# re-run safely: after a previous run `month_year` comes back as a column via
# reset_index() and is simply overwritten, instead of colliding with the
# existing index level.
crime_data = (
    crime_data
    .reset_index()
    .assign(month_year=lambda frame: pd.to_datetime(frame["date"]).dt.to_period("M"))
    .set_index(["month_year", "city_description"])
)
crime_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,date,cbsa_code,population,actual_all_crimes
month_year,city_description,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014-01,"Austin-Round Rock, TX Metro Area",2014-01-01,12420,1941049,6686.0
2014-01,"Charlotte-Concord-Gastonia, NC-SC Metro Area",2014-01-01,16740,2373749,8141.0
2014-01,"Columbia, SC Metro Area",2014-01-01,17900,804684,3172.0
2014-01,"Dallas-Fort Worth-Arlington, TX Metro Area",2014-01-01,19100,6945276,23204.0
2014-01,"Las Vegas-Henderson-Paradise, NV Metro Area",2014-01-01,29820,2066423,8875.0


### Reading in and correctly indexing the Cumulative Returns DataFrame

In [6]:
# Load the per-city cumulative returns CSV.
returns_path = Path("../Project-1/resources/returns_by_city_out.csv")

# Read `date` as a parsed column (no deprecated `infer_datetime_format`, and no
# set-index / reset-index round trip), give the unnamed city column and the
# "0" value column meaningful names, then key the frame by metro-area name.
returns_data = (
    pd.read_csv(returns_path, parse_dates=["date"])
    .rename(columns={
        "Unnamed: 1": "city_description",
        "0": "cum_returns",
    })
    .set_index("city_description")
)

returns_data

Unnamed: 0_level_0,date,cum_returns
city_description,Unnamed: 1_level_1,Unnamed: 2_level_1
"Phoenix-Mesa-Scottsdale, AZ Metro Area",2014-01-31,0.009568
"Riverside-San Bernardino-Ontario, CA Metro Area",2014-01-31,0.014861
"Orlando-Kissimmee-Sanford, FL Metro Area",2014-01-31,0.009348
"Dallas-Fort Worth-Arlington, TX Metro Area",2014-01-31,0.005632
"Austin-Round Rock, TX Metro Area",2014-01-31,0.006739
...,...,...
"Tampa-St. Petersburg-Clearwater, FL Metro Area",2018-12-31,0.005913
"Las Vegas-Henderson-Paradise, NV Metro Area",2018-12-31,0.004245
"Charlotte-Concord-Gastonia, NC-SC Metro Area",2018-12-31,0.004407
"Columbia, SC Metro Area",2018-12-31,0.005442


In [7]:
# Re-key the returns frame by (month_year, city_description).
# Built as a single reset -> assign -> set_index chain so the cell can be
# re-run safely: after a previous run `month_year` comes back as a column via
# reset_index() and is simply overwritten, instead of colliding with the
# existing index level.
returns_data = (
    returns_data
    .reset_index()
    .assign(month_year=lambda frame: pd.to_datetime(frame["date"]).dt.to_period("M"))
    .set_index(["month_year", "city_description"])
)
returns_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,date,cum_returns
month_year,city_description,Unnamed: 2_level_1,Unnamed: 3_level_1
2014-01,"Phoenix-Mesa-Scottsdale, AZ Metro Area",2014-01-31,0.009568
2014-01,"Riverside-San Bernardino-Ontario, CA Metro Area",2014-01-31,0.014861
2014-01,"Orlando-Kissimmee-Sanford, FL Metro Area",2014-01-31,0.009348
2014-01,"Dallas-Fort Worth-Arlington, TX Metro Area",2014-01-31,0.005632
2014-01,"Austin-Round Rock, TX Metro Area",2014-01-31,0.006739


# Joining DataFrames

In [8]:
# Place the crime and returns columns side by side, keeping only the
# (month_year, city_description) keys that appear in both frames.
frames_to_join = [crime_data, returns_data]
combined_df = pd.concat(frames_to_join, axis=1, join="inner")
combined_df

Unnamed: 0_level_0,Unnamed: 1_level_0,date,cbsa_code,population,actual_all_crimes,date,cum_returns
month_year,city_description,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2014-01,"Austin-Round Rock, TX Metro Area",2014-01-01,12420,1941049,6686.0,2014-01-31,0.006739
2014-01,"Charlotte-Concord-Gastonia, NC-SC Metro Area",2014-01-01,16740,2373749,8141.0,2014-01-31,0.006073
2014-01,"Columbia, SC Metro Area",2014-01-01,17900,804684,3172.0,2014-01-31,0.002170
2014-01,"Dallas-Fort Worth-Arlington, TX Metro Area",2014-01-01,19100,6945276,23204.0,2014-01-31,0.005632
2014-01,"Las Vegas-Henderson-Paradise, NV Metro Area",2014-01-01,29820,2066423,8875.0,2014-01-31,0.010203
...,...,...,...,...,...,...,...
2018-12,"Nashville-Davidson--Murfreesboro--Franklin, TN Metro Area",2018-12-01,34980,1942634,7540.0,2018-12-31,0.002739
2018-12,"Orlando-Kissimmee-Sanford, FL Metro Area",2018-12-01,36740,2568290,0.0,2018-12-31,0.004526
2018-12,"Phoenix-Mesa-Scottsdale, AZ Metro Area",2018-12-01,38060,4876829,16519.0,2018-12-31,0.004092
2018-12,"Riverside-San Bernardino-Ontario, CA Metro Area",2018-12-01,40140,4598111,13504.0,2018-12-31,-0.000452


In [9]:
# Drop the columns not used for modelling. Both frames contributed a `date`
# column, and dropping by label removes every column with that name.
# errors="ignore" makes the cell safe to re-run: once the columns are gone a
# second execution is a no-op instead of raising KeyError.
combined_df = combined_df.drop(columns=["date", "cbsa_code"], errors="ignore")
combined_df

Unnamed: 0_level_0,Unnamed: 1_level_0,population,actual_all_crimes,cum_returns
month_year,city_description,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014-01,"Austin-Round Rock, TX Metro Area",1941049,6686.0,0.006739
2014-01,"Charlotte-Concord-Gastonia, NC-SC Metro Area",2373749,8141.0,0.006073
2014-01,"Columbia, SC Metro Area",804684,3172.0,0.002170
2014-01,"Dallas-Fort Worth-Arlington, TX Metro Area",6945276,23204.0,0.005632
2014-01,"Las Vegas-Henderson-Paradise, NV Metro Area",2066423,8875.0,0.010203
...,...,...,...,...
2018-12,"Nashville-Davidson--Murfreesboro--Franklin, TN Metro Area",1942634,7540.0,0.002739
2018-12,"Orlando-Kissimmee-Sanford, FL Metro Area",2568290,0.0,0.004526
2018-12,"Phoenix-Mesa-Scottsdale, AZ Metro Area",4876829,16519.0,0.004092
2018-12,"Riverside-San Bernardino-Ontario, CA Metro Area",4598111,13504.0,-0.000452


In [10]:
# Sanity check before modelling: row count, dtypes and non-null counts per column.
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 600 entries, (Period('2014-01', 'M'), 'Austin-Round Rock, TX Metro Area') to (Period('2018-12', 'M'), 'Tampa-St. Petersburg-Clearwater, FL Metro Area')
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   population         600 non-null    int64  
 1   actual_all_crimes  600 non-null    float64
 2   cum_returns        600 non-null    float64
dtypes: float64(2), int64(1)
memory usage: 18.7+ KB


# Multiple Linear Regression

In [11]:
# Pairwise Pearson correlation matrix over the columns of combined_df.
correlations = combined_df.corr(method='pearson')
correlations

Unnamed: 0,population,actual_all_crimes,cum_returns
population,1.0,0.419031,0.181291
actual_all_crimes,0.419031,1.0,0.095901
cum_returns,0.181291,0.095901,1.0


In [16]:
predictors = ['population', 'actual_all_crimes']
outcome = 'cum_returns'

# Both predictors are already numeric (int64 / float64 per combined_df.info()),
# so they are used directly; wrapping them in pd.get_dummies was a no-op that
# wrongly suggested categorical inputs.
# NOTE(review): actual_all_crimes contains 0.0 rows (e.g. Orlando 2018-12 in the
# displayed data) - possibly unreported data rather than true zeros; confirm.
# NOTE(review): raw crime counts correlate with population (~0.42 in the
# correlation table); a per-capita rate may be a cleaner predictor - confirm.
x = combined_df[predictors]
y = combined_df[outcome]

# Hold out 10% of the rows for validation; the fixed random_state keeps the
# split reproducible across runs.
train_x, valid_x, train_y, valid_y = train_test_split(x, y, test_size=0.1, random_state=1)

housing_lm = LinearRegression()
housing_lm.fit(train_x, train_y)

LinearRegression()

In [17]:
# Pair each predictor with its fitted coefficient. Leaving the frame as the
# cell's last expression lets Jupyter render it as a table rather than the
# plain-text print() output. The column label spelling is corrected
# ("coffecients" -> "coefficients").
pd.DataFrame({'Predictor': x.columns, 'coefficients': housing_lm.coef_})

           Predictor   coffecients
0         population  3.014148e-10
1  actual_all_crimes  6.978044e-09


In [18]:
# Recombine the training predictors and outcome into one frame, because the
# statsmodels formula API looks columns up by name in a single DataFrame.
train_df = train_x.join(train_y)

# Build the formula from `outcome` and `predictors` so it always matches the
# column that train_y contributes, instead of hard-coding the outcome name.
formula = f"{outcome} ~ {'+'.join(predictors)}"
print(formula)

# NOTE(review): this rebinds `housing_lm` from the scikit-learn estimator to a
# statsmodels results object, so the coefficient cell that reads
# `housing_lm.coef_` cannot be re-run after this one without refitting first.
housing_lm = smf.ols(formula=formula, data=train_df).fit()
housing_lm.summary()

cum_returns ~ population+actual_all_crimes


0,1,2,3
Dep. Variable:,cum_returns,R-squared:,0.036
Model:,OLS,Adj. R-squared:,0.033
Method:,Least Squares,F-statistic:,10.07
Date:,"Mon, 28 Dec 2020",Prob (F-statistic):,5.09e-05
Time:,22:35:10,Log-Likelihood:,2388.3
No. Observations:,540,AIC:,-4771.0
Df Residuals:,537,BIC:,-4758.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0048,0.000,18.865,0.000,0.004,0.005
population,3.014e-10,7.9e-11,3.813,0.000,1.46e-10,4.57e-10
actual_all_crimes,6.978e-09,1.1e-08,0.634,0.527,-1.47e-08,2.86e-08

0,1,2,3
Omnibus:,63.209,Durbin-Watson:,2.0
Prob(Omnibus):,0.0,Jarque-Bera (JB):,173.41
Skew:,0.576,Prob(JB):,2.21e-38
Kurtosis:,5.526,Cond. No.,7250000.0
