In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestRegressor
import pandas as pd

In [2]:
# Load the combined dataset and keep only the four columns used in this notebook.
crime_columns = ["State", "Year", "Population", "Violent Crime"]
crime_df = pd.read_csv("../data/combined_data_total.csv").loc[:, crime_columns]
crime_df

Unnamed: 0,State,Year,Population,Violent Crime
0,ALABAMA,2011,1201968,10782
1,ALASKA,2011,593910,4776
2,ARIZONA,2011,6504102,27034
3,ARKANSAS,2011,389976,5810
4,CALIFORNIA,2011,36797192,174798
...,...,...,...,...
301,TEXAS,2018,19553606,46274
302,UTAH,2018,1146286,2902
303,VIRGINIA,2018,2758044,4226
304,WASHINGTON,2018,1212192,2658


In [3]:
# Isolate the 2018 rows and record their state names in row order.
is_2018 = crime_df["Year"] == 2018
crime_df2018 = crime_df.loc[is_2018].reset_index()
states2018 = crime_df2018["State"].tolist()
states2018

['ARIZONA',
 'ARKANSAS',
 'CALIFORNIA',
 'COLORADO',
 'CONNECTICUT',
 'FLORIDA',
 'GEORGIA',
 'HAWAII',
 'IDAHO',
 'ILLINOIS',
 'INDIANA',
 'KANSAS',
 'KENTUCKY',
 'LOUISIANA',
 'MARYLAND',
 'MASSACHUSETTS',
 'MICHIGAN',
 'MINNESOTA',
 'MISSOURI',
 'MONTANA',
 'NEBRASKA',
 'NEVADA',
 'NEW HAMPSHIRE',
 'NEW JERSEY',
 'NEW MEXICO',
 'NEW YORK',
 'NORTH DAKOTA',
 'OHIO',
 'OKLAHOMA',
 'OREGON',
 'PENNSYLVANIA',
 'RHODE ISLAND',
 'SOUTH CAROLINA',
 'SOUTH DAKOTA',
 'TENNESSEE',
 'TEXAS',
 'UTAH',
 'VIRGINIA',
 'WASHINGTON',
 'WISCONSIN']

In [4]:
# Collect the names of every column whose dtype is "object" (the categorical columns)
crime_cat = [
    column_name
    for column_name, column_dtype in crime_df.dtypes.items()
    if column_dtype == "object"
]
crime_cat

['State']

In [5]:
# Create a OneHotEncoder instance.
# The encoder is left on its default (sparse) output and the result is densified
# below: the keyword that requested dense output was renamed from `sparse` to
# `sparse_output` across scikit-learn releases, so passing neither keeps this
# cell working on both sides of the rename.
enc = OneHotEncoder()

# Fit and transform the OneHotEncoder using the categorical variable list.
# Reusing crime_df's index keeps every encoded row aligned with its source row
# when the two frames are later merged on their indexes.
encode_df = pd.DataFrame(
    enc.fit_transform(crime_df[crime_cat]).toarray(),
    index=crime_df.index,
)

# Add the encoded variable names to the dataframe.
# get_feature_names_out is the replacement for the removed get_feature_names.
encode_df.columns = enc.get_feature_names_out(crime_cat)
encode_df.head()

Unnamed: 0,State_ALABAMA,State_ALASKA,State_ARIZONA,State_ARKANSAS,State_CALIFORNIA,State_COLORADO,State_CONNECTICUT,State_DISTRICT OF COLUMBIA,State_FLORIDA,State_GEORGIA,...,State_PENNSYLVANIA,State_RHODE ISLAND,State_SOUTH CAROLINA,State_SOUTH DAKOTA,State_TENNESSEE,State_TEXAS,State_UTAH,State_VIRGINIA,State_WASHINGTON,State_WISCONSIN
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Merge one-hot encoded features and drop the originals.
# The two frames are joined row-by-row on their indexes.
crime_df = crime_df.merge(encode_df, left_index=True, right_index=True)
# Name the axis with `columns=`: pandas 2.0 no longer accepts the axis as a
# second positional argument to drop().
crime_df = crime_df.drop(columns=crime_cat)
crime_df

Unnamed: 0,Year,Population,Violent Crime,State_ALABAMA,State_ALASKA,State_ARIZONA,State_ARKANSAS,State_CALIFORNIA,State_COLORADO,State_CONNECTICUT,...,State_PENNSYLVANIA,State_RHODE ISLAND,State_SOUTH CAROLINA,State_SOUTH DAKOTA,State_TENNESSEE,State_TEXAS,State_UTAH,State_VIRGINIA,State_WASHINGTON,State_WISCONSIN
0,2011,1201968,10782,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2011,593910,4776,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2011,6504102,27034,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2011,389976,5810,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2011,36797192,174798,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301,2018,19553606,46274,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
302,2018,1146286,2902,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
303,2018,2758044,4226,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
304,2018,1212192,2658,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [7]:
# Target vector: the Population column.
y = crime_df["Population"].to_numpy()

# Feature matrix: every remaining column except the target and Violent Crime.
feature_df = crime_df.drop(columns=["Population", "Violent Crime"])
X = feature_df.to_numpy()

# Hold out a test split; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=100)

In [8]:
# Build a random forest regressor (n_estimators=300, fixed seed) and fit it on
# the training split in a single step.
rf_model = RandomForestRegressor(n_estimators=300, random_state=100).fit(X_train, y_train)

# Predict the target for the held-out test features.
y_pred = rf_model.predict(X_test)

In [9]:
# Show the predictions the fitted model produced for the held-out test features.
y_pred

array([  601739.44666667,  3733295.37333333,  2205176.02      ,
        5319479.76666667,  2200156.17333333,  1114788.8       ,
         679808.37333333,  9374016.79333333,  5571683.12666667,
        1846328.68      ,  1532338.3       ,  1586788.84666667,
        7097474.48      ,  1368583.16666667,  1736868.36      ,
       24354910.77333333,  2740541.24      ,  1417665.05333333,
        1168130.14666667,  1846998.72      ,  1871719.72      ,
        1178521.3       ,  4884083.66      ,   573291.20666667,
        1162781.87333333,  2593994.02      ,   307147.32      ,
         664305.57333333, 13151801.58666667,  1526179.        ,
        1159050.42      ,   315790.77333333,  3424158.94666667,
         275380.76      ,  2251326.93333333,  1008257.62      ,
        4811684.84      ,  4880574.46      ,  3946221.46666667,
        4373121.3       ,  3060009.32      ,  6487999.19333333,
        2649143.38666667,  9156758.08      ,  1818645.36      ,
        2318429.14666667,  2229751.89333

In [10]:
# Build the feature rows for the forecast: take every 2018 row and move its
# Year forward by 10.
# The explicit .copy() makes population_pred_df an independent frame, so the
# Year assignment below no longer raises pandas' SettingWithCopyWarning.
population_pred_df = crime_df[crime_df["Year"]==2018].copy()
population_pred_df["Year"] = population_pred_df["Year"]+10

# Drop the same two columns that were excluded from X when the model was fit,
# so the column layout matches the training features.
prediction_table_df = population_pred_df.drop(["Violent Crime", "Population"],axis=1)

# NOTE(review): the shifted Year lies outside the years present in crime_df, and
# tree-based models generally cannot extrapolate past the range they were
# trained on - confirm these outputs are meaningful as a future forecast.
pop_predictions = rf_model.predict(prediction_table_df.values)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [11]:
# Show the predictions made for the 2018 rows after their Year was shifted forward by 10.
pop_predictions

array([ 7102664.71333333,   573291.20666667, 35024617.31333333,
        4592906.18      ,  1026965.31333333,  9372123.03333333,
        1595485.83333333,  1894246.57333333,   658723.42666667,
        7051701.34666667,  1346053.64      ,  1029083.34666667,
         814306.1       ,  1158428.81333333,  1143720.38      ,
        1111957.12      ,  2542939.38666667,  1533777.66666667,
        2289507.72666667,   369840.62      ,  1162781.87333333,
        4884083.66      ,   298122.66666667,  2029607.26      ,
        1146645.15333333,  1803223.02666667,   301625.94666667,
        2114681.1       ,  2542892.09333333,  1443102.21333333,
        3424158.94666667,   664305.57333333,   571212.56666667,
         427501.62      ,  3970568.65333333, 20070083.        ,
         991539.08666667,  2776498.69333333,  1919323.68      ,
        1849546.12666667])

In [12]:
# Attach the predictions to the feature table, then keep only Year and
# Population on a fresh 0..n-1 index.
prediction_table_df["Population"] = pop_predictions
year_pop_df = prediction_table_df.loc[:, ["Year", "Population"]].reset_index(drop=True)
year_pop_df

Unnamed: 0,Year,Population
0,2028,7102665.0
1,2028,573291.2
2,2028,35024620.0
3,2028,4592906.0
4,2028,1026965.0
5,2028,9372123.0
6,2028,1595486.0
7,2028,1894247.0
8,2028,658723.4
9,2028,7051701.0


In [13]:
# Label each prediction with its state. The labels are matched by position:
# states2018 and year_pop_df both derive from the Year == 2018 rows of crime_df,
# taken in the same row order.
year_pop_df = year_pop_df.assign(State=states2018)
year_pop_df

Unnamed: 0,Year,Population,State
0,2028,7102665.0,ARIZONA
1,2028,573291.2,ARKANSAS
2,2028,35024620.0,CALIFORNIA
3,2028,4592906.0,COLORADO
4,2028,1026965.0,CONNECTICUT
5,2028,9372123.0,FLORIDA
6,2028,1595486.0,GEORGIA
7,2028,1894247.0,HAWAII
8,2028,658723.4,IDAHO
9,2028,7051701.0,ILLINOIS


In [14]:
# Final table: the same rows with the columns ordered State, Year, Population.
clean_pop_2028_df = year_pop_df[["State", "Year", "Population"]]
clean_pop_2028_df

Unnamed: 0,State,Year,Population
0,ARIZONA,2028,7102665.0
1,ARKANSAS,2028,573291.2
2,CALIFORNIA,2028,35024620.0
3,COLORADO,2028,4592906.0
4,CONNECTICUT,2028,1026965.0
5,FLORIDA,2028,9372123.0
6,GEORGIA,2028,1595486.0
7,HAWAII,2028,1894247.0
8,IDAHO,2028,658723.4
9,ILLINOIS,2028,7051701.0


In [15]:
# Write the State/Year/Population prediction table to a CSV file; the path is
# relative, so the file lands in the current working directory.
clean_pop_2028_df.to_csv("population_pred_2028.csv")