In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestRegressor
import pandas as pd

In [2]:
crime_df = pd.read_csv("../data/state_year_avg.csv")[["State", "Year", "Population", "Violent Crime"]]
crime_df

Unnamed: 0,State,Year,Population,Violent Crime
0,ALABAMA,2011,200328,1797
1,ALASKA,2011,296955,2388
2,ARIZONA,2011,361339,1502
3,ARKANSAS,2011,194988,2905
4,CALIFORNIA,2011,266646,1267
...,...,...,...,...
301,TEXAS,2018,279337,661
302,UTAH,2018,143286,363
303,VIRGINIA,2018,229837,352
304,WASHINGTON,2018,151524,332


In [3]:
crime_df2018 = crime_df[crime_df["Year"]==2018].reset_index()
states2018 = list(crime_df2018["State"].values)
states2018

['ARIZONA',
 'ARKANSAS',
 'CALIFORNIA',
 'COLORADO',
 'CONNECTICUT',
 'FLORIDA',
 'GEORGIA',
 'HAWAII',
 'IDAHO',
 'ILLINOIS',
 'INDIANA',
 'KANSAS',
 'KENTUCKY',
 'LOUISIANA',
 'MARYLAND',
 'MASSACHUSETTS',
 'MICHIGAN',
 'MINNESOTA',
 'MISSOURI',
 'MONTANA',
 'NEBRASKA',
 'NEVADA',
 'NEW HAMPSHIRE',
 'NEW JERSEY',
 'NEW MEXICO',
 'NEW YORK',
 'NORTH DAKOTA',
 'OHIO',
 'OKLAHOMA',
 'OREGON',
 'PENNSYLVANIA',
 'RHODE ISLAND',
 'SOUTH CAROLINA',
 'SOUTH DAKOTA',
 'TENNESSEE',
 'TEXAS',
 'UTAH',
 'VIRGINIA',
 'WASHINGTON',
 'WISCONSIN']

In [4]:
# Generate our categorical variable lists
crime_cat = crime_df.dtypes[crime_df.dtypes == "object"].index.tolist()
crime_cat

['State']

In [5]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(crime_df[crime_cat]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(crime_cat)
encode_df.head()

Unnamed: 0,State_ALABAMA,State_ALASKA,State_ARIZONA,State_ARKANSAS,State_CALIFORNIA,State_COLORADO,State_CONNECTICUT,State_DISTRICT OF COLUMBIA,State_FLORIDA,State_GEORGIA,...,State_PENNSYLVANIA,State_RHODE ISLAND,State_SOUTH CAROLINA,State_SOUTH DAKOTA,State_TENNESSEE,State_TEXAS,State_UTAH,State_VIRGINIA,State_WASHINGTON,State_WISCONSIN
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Merge one-hot encoded features and drop the originals
crime_df = crime_df.merge(encode_df,left_index=True, right_index=True)
crime_df = crime_df.drop(crime_cat,1)
crime_df

Unnamed: 0,Year,Population,Violent Crime,State_ALABAMA,State_ALASKA,State_ARIZONA,State_ARKANSAS,State_CALIFORNIA,State_COLORADO,State_CONNECTICUT,...,State_PENNSYLVANIA,State_RHODE ISLAND,State_SOUTH CAROLINA,State_SOUTH DAKOTA,State_TENNESSEE,State_TEXAS,State_UTAH,State_VIRGINIA,State_WASHINGTON,State_WISCONSIN
0,2011,200328,1797,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2011,296955,2388,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2011,361339,1502,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2011,194988,2905,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2011,266646,1267,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301,2018,279337,661,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
302,2018,143286,363,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
303,2018,229837,352,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
304,2018,151524,332,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [7]:
# Remove loan status target from features data
y = crime_df.Population.values
X = crime_df.drop(columns=["Population","Violent Crime"], axis=1).values

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=100)

# Create a StandardScaler instances
#scaler = StandardScaler()

# Fit the StandardScaler
#X_scaler = scaler.fit(X_train)

# Scale the data
#X_train_scaled = X_scaler.transform(X_train)
#X_test_scaled = X_scaler.transform(X_test)

In [8]:
# Create a random forest classifier.
rf_model = RandomForestRegressor(n_estimators=300, random_state=100) 

# Fitting the model
rf_model = rf_model.fit(X_train, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test)
#print(f" Random forest model accuracy: {accuracy_score(y_test,y_pred):.3f}")

In [9]:
y_pred

array([296328.21      , 566623.03      , 219239.4       , 575382.25333333,
       224387.11333333, 213322.80333333, 164956.15666667, 218494.48      ,
       321344.06333333, 315317.61666667, 484085.95333333, 274702.86      ,
       379749.05666667, 141238.81333333, 112324.17      , 401534.16666667,
       241996.87      , 476347.25      , 214570.53666667, 165602.43      ,
       316126.93666667, 586389.78333333, 589316.19666667, 176977.16      ,
       328722.03666667, 266698.08666667, 111499.79333333, 173045.33666667,
       281458.03666667, 250875.97      , 592182.32666667, 113628.55333333,
       682497.20333333, 111416.53666667, 231274.27333333, 128226.61      ,
       581066.39666667, 322589.74666667, 328203.69      , 242241.48666667,
       331746.42      , 419908.94666667, 229003.02333333, 216494.44333333,
       476381.99666667, 597322.43333333, 156607.73      , 194106.30666667,
       190537.72      , 135505.24      , 268156.78      , 178081.77666667,
       127945.77666667, 1

In [10]:
population_pred_df = crime_df[crime_df["Year"]==2018]
population_pred_df["Year"] = population_pred_df["Year"]+10
prediction_table_df = population_pred_df.drop(["Violent Crime", "Population"],axis=1)
pop_predictions = rf_model.predict(prediction_table_df.values)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [11]:
pop_predictions

array([378558.81666667, 176977.16      , 293668.23666667, 221890.43666667,
       128024.43      , 178081.77666667, 242543.24      , 966734.03666667,
       175753.35333333, 475493.36      , 195513.66333333, 319097.92666667,
       342346.25666667, 177435.64333333, 584199.05      , 142588.48333333,
       204795.39333333, 268156.78      , 190537.72      , 112166.74333333,
       328722.03666667, 589316.19666667, 112449.73333333, 158615.06333333,
       397619.54333333, 181485.73666667, 124251.22      , 270796.71666667,
       322898.15      , 165949.37333333, 682497.20333333, 173045.33666667,
       134325.78333333, 178000.29      , 328914.31      , 298416.52666667,
       145033.9       , 190608.13333333, 168143.97666667, 314781.93      ])

In [12]:
prediction_table_df["Population"] = pop_predictions
year_pop_df = prediction_table_df[["Year","Population"]].reset_index().drop(["index"], axis=1)
year_pop_df

Unnamed: 0,Year,Population
0,2028,378558.816667
1,2028,176977.16
2,2028,293668.236667
3,2028,221890.436667
4,2028,128024.43
5,2028,178081.776667
6,2028,242543.24
7,2028,966734.036667
8,2028,175753.353333
9,2028,475493.36


In [13]:
year_pop_df["State"] = states2018
year_pop_df

Unnamed: 0,Year,Population,State
0,2028,378558.816667,ARIZONA
1,2028,176977.16,ARKANSAS
2,2028,293668.236667,CALIFORNIA
3,2028,221890.436667,COLORADO
4,2028,128024.43,CONNECTICUT
5,2028,178081.776667,FLORIDA
6,2028,242543.24,GEORGIA
7,2028,966734.036667,HAWAII
8,2028,175753.353333,IDAHO
9,2028,475493.36,ILLINOIS


In [14]:
clean_pop_2028_df = pd.DataFrame(year_pop_df["State"])
clean_pop_2028_df["Year"] = year_pop_df["Year"]
clean_pop_2028_df["Population"] = year_pop_df["Population"]
clean_pop_2028_df

Unnamed: 0,State,Year,Population
0,ARIZONA,2028,378558.816667
1,ARKANSAS,2028,176977.16
2,CALIFORNIA,2028,293668.236667
3,COLORADO,2028,221890.436667
4,CONNECTICUT,2028,128024.43
5,FLORIDA,2028,178081.776667
6,GEORGIA,2028,242543.24
7,HAWAII,2028,966734.036667
8,IDAHO,2028,175753.353333
9,ILLINOIS,2028,475493.36


In [15]:
clean_pop_2028_df.to_csv("population_pred_2028.csv")