In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import pandas as pd

In [2]:
crime_df = pd.read_csv("../data/state_year_avg.csv")[["State", "Year", "Population", "Violent Crime"]]
crime_df["State"].unique()

array(['ALABAMA', 'ALASKA', 'ARIZONA', 'ARKANSAS', 'CALIFORNIA',
       'COLORADO', 'CONNECTICUT', 'DISTRICT OF COLUMBIA', 'FLORIDA',
       'GEORGIA', 'IDAHO', 'ILLINOIS', 'INDIANA', 'IOWA', 'KANSAS',
       'KENTUCKY', 'LOUISIANA', 'MARYLAND', 'MASSACHUSETTS', 'MICHIGAN',
       'MINNESOTA', 'MISSISSIPPI', 'MISSOURI', 'MONTANA', 'NEBRASKA',
       'NEVADA', 'NEW HAMPSHIRE', 'NEW JERSEY', 'NEW MEXICO', 'NEW YORK',
       'NORTH CAROLINA', 'NORTH DAKOTA', 'OHIO', 'OKLAHOMA', 'OREGON',
       'PENNSYLVANIA', 'SOUTH CAROLINA', 'SOUTH DAKOTA', 'TENNESSEE',
       'TEXAS', 'UTAH', 'VIRGINIA', 'WASHINGTON', 'WISCONSIN',
       'RHODE ISLAND', 'HAWAII'], dtype=object)

In [3]:
pop2028 = pd.read_csv("population_pred_2028.csv").drop(["Unnamed: 0"], axis=1)
pop2028

Unnamed: 0,State,Year,Population
0,ARIZONA,2028,378558.816667
1,ARKANSAS,2028,176977.16
2,CALIFORNIA,2028,293668.236667
3,COLORADO,2028,221890.436667
4,CONNECTICUT,2028,128024.43
5,FLORIDA,2028,178081.776667
6,GEORGIA,2028,242543.24
7,HAWAII,2028,966734.036667
8,IDAHO,2028,175753.353333
9,ILLINOIS,2028,475493.36


In [4]:
states40 = list(pop2028["State"].values)
len(states40)

40

In [5]:
cleaned = crime_df[crime_df['State'].isin(states40)]
cleaned

Unnamed: 0,State,Year,Population,Violent Crime
2,ARIZONA,2011,361339,1502
3,ARKANSAS,2011,194988,2905
4,CALIFORNIA,2011,266646,1267
5,COLORADO,2011,209431,948
6,CONNECTICUT,2011,126592,1116
...,...,...,...,...
301,TEXAS,2018,279337,661
302,UTAH,2018,143286,363
303,VIRGINIA,2018,229837,352
304,WASHINGTON,2018,151524,332


In [6]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(cleaned[["State"]]))

# Add the encoded variable names to the dataframe
encode_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
280,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
281,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
282,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [7]:
# Merge one-hot encoded features and drop the originals
cleaned.merge(encode_df,left_index=True, right_index=True)

Unnamed: 0,State,Year,Population,Violent Crime,0,1,2,3,4,5,...,30,31,32,33,34,35,36,37,38,39
2,ARIZONA,2011,361339,1502,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ARKANSAS,2011,194988,2905,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,CALIFORNIA,2011,266646,1267,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,COLORADO,2011,209431,948,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,CONNECTICUT,2011,126592,1116,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279,LOUISIANA,2018,181063,730,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
280,MARYLAND,2018,605436,5143,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
281,MASSACHUSETTS,2018,142059,429,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
282,MICHIGAN,2018,213453,1283,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [8]:
# Merge one-hot encoded features and drop the originals
crime_df = cleaned.drop(["State"], axis=1).merge(encode_df,left_index=True, right_index=True)
crime_df

Unnamed: 0,Year,Population,Violent Crime,0,1,2,3,4,5,6,...,30,31,32,33,34,35,36,37,38,39
2,2011,361339,1502,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2011,194988,2905,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2011,266646,1267,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,2011,209431,948,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,2011,126592,1116,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279,2018,181063,730,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
280,2018,605436,5143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
281,2018,142059,429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
282,2018,213453,1283,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [9]:
# Remove target from features data
y = crime_df["Violent Crime"].values
X = crime_df.drop(columns=["Violent Crime"], axis=1).values

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=100)

In [10]:
# Create a random forest classifier.
rf_model = RandomForestRegressor(n_estimators=300, random_state=100) 

# Fitting the model
rf_model = rf_model.fit(X_train, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test)

In [11]:
y_pred

array([4674.43666667,  238.89666667,  229.48333333,  439.52333333,
       1720.72      , 1444.56666667, 1104.78      ,  399.53      ,
        709.79333333,  553.45      , 1408.06      ,  793.93666667,
        326.29      , 1119.18      , 1951.52      , 1749.98666667,
        440.42333333, 2896.11      ,  733.39      ,  428.83      ,
        280.29333333,  389.03666667,  463.9       ,  398.39333333,
       3338.37666667,  452.75666667, 1257.77333333,  380.07666667,
        469.62      , 1309.16      , 1395.67333333,  767.54333333,
       3295.18666667, 1341.51666667, 1432.10666667,  249.14666667,
        861.26666667,  930.26      ,  423.00666667,  948.38666667,
       1171.96333333,  655.08333333,  332.97      , 1604.86666667,
        444.21666667, 1266.41666667,  817.74666667,  909.4       ,
        399.23333333,  224.22      ,  390.88666667,  815.61333333,
       1152.75666667, 1819.27333333,  663.94      ,  325.61666667,
       1927.14      , 1754.56333333,  248.13      , 1150.47   

In [12]:
encode_2028 = pd.DataFrame(enc.fit_transform(pop2028[["State"]]))

encode_2028 = cleaned.drop(["State", "Violent Crime"], axis=1).merge(encode_2028,left_index=True, right_index=True)

encode_2028

Unnamed: 0,Year,Population,0,1,2,3,4,5,6,7,...,30,31,32,33,34,35,36,37,38,39
2,2011,361339,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2011,194988,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2011,266646,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,2011,209431,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,2011,126592,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,2011,208439,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,2011,240364,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,2011,207945,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,2011,140350,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,2011,158234,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
pred_2028 = rf_model.predict(encode_2028)
pred_2028

array([1913.93666667, 2207.66333333, 1272.87      , 1171.96333333,
        961.36333333, 1168.11666667, 1497.96666667, 1501.05333333,
        790.25666667,  703.11666667, 1407.82      , 2505.33666667,
       1344.84666667, 5726.82666667,  902.61      , 1491.78333333,
       2483.53666667, 1625.89      ,  434.92333333, 2456.36666667,
       4663.22666667,  381.98666667, 1204.70333333, 3463.17333333,
       1421.29666667,  377.18666667, 1341.51666667, 2306.69333333,
       1359.09333333,  940.33      ,  443.31333333,  763.11      ,
       1714.02666667, 2278.87333333])

In [14]:
pop2028

Unnamed: 0,State,Year,Population
0,ARIZONA,2028,378558.816667
1,ARKANSAS,2028,176977.16
2,CALIFORNIA,2028,293668.236667
3,COLORADO,2028,221890.436667
4,CONNECTICUT,2028,128024.43
5,FLORIDA,2028,178081.776667
6,GEORGIA,2028,242543.24
7,HAWAII,2028,966734.036667
8,IDAHO,2028,175753.353333
9,ILLINOIS,2028,475493.36


In [20]:
# pop2028["Violent Crime"] = pred_2028
# year_vcrime_df = pop2028[["Year","Population","Violent Crime"]].reset_index().drop(["index"], axis=1)
# year_vcrime_df

In [21]:
# population_pred_df = crime_df[crime_df["Year"]==2018]
# population_pred_df["Year"] = population_pred_df["Year"]+10
# prediction_table_df = population_pred_df.drop(["Violent Crime", "Population"],axis=1)
# pop_predictions = rf_model.predict(prediction_table_df.values)

In [22]:
# pop_predictions

In [23]:
# prediction_table_df["Population"] = pop_predictions
# year_pop_df = prediction_table_df[["Year","Population"]].reset_index().drop(["index"], axis=1)
# year_pop_df

In [24]:
# year_pop_df["State"] = states2018
# year_pop_df