In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import pandas as pd

In [2]:
crime_df = pd.read_csv("../data/state_year_avg.csv")[["State", "Year", "Population", "Violent Crime"]]
crime_df

Unnamed: 0,State,Year,Population,Violent Crime
0,ALABAMA,2011,200328,1797
1,ALASKA,2011,296955,2388
2,ARIZONA,2011,361339,1502
3,ARKANSAS,2011,194988,2905
4,CALIFORNIA,2011,266646,1267
...,...,...,...,...
301,TEXAS,2018,279337,661
302,UTAH,2018,143286,363
303,VIRGINIA,2018,229837,352
304,WASHINGTON,2018,151524,332


In [3]:
len(set(list(crime_df["State"].values)))

46

In [4]:
pop2028 = pd.read_csv("population_pred_2028.csv").drop(["Unnamed: 0"], axis=1)
pop2028

Unnamed: 0,State,Year,Population
0,ARIZONA,2028,378558.816667
1,ARKANSAS,2028,176977.16
2,CALIFORNIA,2028,293668.236667
3,COLORADO,2028,221890.436667
4,CONNECTICUT,2028,128024.43
5,FLORIDA,2028,178081.776667
6,GEORGIA,2028,242543.24
7,HAWAII,2028,966734.036667
8,IDAHO,2028,175753.353333
9,ILLINOIS,2028,475493.36


In [5]:
states40 = list(pop2028["State"].values)
len(states40)

40

In [6]:
#crime_df[crime_df["State"]]
#removelist = ['ayside','rrowview']
#testing = np.where(crime_df["State"].str.contains('|'.join(states40)),1,0)
#testing

cleaned = crime_df[crime_df['State'].isin(states40)]
cleaned

Unnamed: 0,State,Year,Population,Violent Crime
2,ARIZONA,2011,361339,1502
3,ARKANSAS,2011,194988,2905
4,CALIFORNIA,2011,266646,1267
5,COLORADO,2011,209431,948
6,CONNECTICUT,2011,126592,1116
...,...,...,...,...
301,TEXAS,2018,279337,661
302,UTAH,2018,143286,363
303,VIRGINIA,2018,229837,352
304,WASHINGTON,2018,151524,332


In [7]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(cleaned[["State"]]))

# Add the encoded variable names to the dataframe
#encode_df.columns = enc.get_feature_names(cleaned[["State"]])
encode_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
280,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
281,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
282,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [8]:
# Merge one-hot encoded features and drop the originals
cleaned.merge(encode_df,left_index=True, right_index=True)

Unnamed: 0,State,Year,Population,Violent Crime,0,1,2,3,4,5,...,30,31,32,33,34,35,36,37,38,39
2,ARIZONA,2011,361339,1502,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ARKANSAS,2011,194988,2905,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,CALIFORNIA,2011,266646,1267,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,COLORADO,2011,209431,948,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,CONNECTICUT,2011,126592,1116,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279,LOUISIANA,2018,181063,730,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
280,MARYLAND,2018,605436,5143,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
281,MASSACHUSETTS,2018,142059,429,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
282,MICHIGAN,2018,213453,1283,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [9]:
# Merge one-hot encoded features and drop the originals
crime_df = cleaned.drop(["State"], axis=1).merge(encode_df,left_index=True, right_index=True)
#crime_df = cleaned.drop(["State"], axis=1)
crime_df

Unnamed: 0,Year,Population,Violent Crime,0,1,2,3,4,5,6,...,30,31,32,33,34,35,36,37,38,39
2,2011,361339,1502,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2011,194988,2905,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2011,266646,1267,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,2011,209431,948,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,2011,126592,1116,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279,2018,181063,730,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
280,2018,605436,5143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
281,2018,142059,429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
282,2018,213453,1283,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [10]:
# Remove loan status target from features data
y = crime_df["Violent Crime"].values
X = crime_df.drop(columns=["Violent Crime"], axis=1).values

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=100)

# Create a StandardScaler instances
#scaler = StandardScaler()

# Fit the StandardScaler
#X_scaler = scaler.fit(X_train)

# Scale the data
#X_train_scaled = X_scaler.transform(X_train)
#X_test_scaled = X_scaler.transform(X_test)

In [11]:
# Create a random forest classifier.
rf_model = RandomForestRegressor(n_estimators=300, random_state=100) 

# Fitting the model
rf_model = rf_model.fit(X_train, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test)
#print(f" Random forest model accuracy: {accuracy_score(y_test,y_pred):.3f}")

In [12]:
encode_2028 = pd.DataFrame(enc.fit_transform(pop2028[["State"]]))

encode_2028 = cleaned.drop(["State"], axis=1).merge(encode_2028,left_index=True, right_index=True)

In [13]:
pred_2028 = rf_model.predict(encode_2028)
pred_2028

ValueError: Number of features of the model must match the input. Model n_features is 42 and input n_features is 43 

In [None]:
# population_pred_df = crime_df[crime_df["Year"]==2018]
# population_pred_df["Year"] = population_pred_df["Year"]+10
# prediction_table_df = population_pred_df.drop(["Violent Crime", "Population"],axis=1)
# pop_predictions = rf_model.predict(prediction_table_df.values)

In [None]:
# pop_predictions

In [None]:
# prediction_table_df["Population"] = pop_predictions
# year_pop_df = prediction_table_df[["Year","Population"]].reset_index().drop(["index"], axis=1)
# year_pop_df

In [None]:
# year_pop_df["State"] = states2018
# year_pop_df