In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

In [2]:
crime_df = pd.read_csv("../data/state_year_avg.csv")[["State", "Year", "Population", "Violent Crime"]]
crime_df

Unnamed: 0,State,Year,Population,Violent Crime
0,ALABAMA,2011,200328,1797
1,ALASKA,2011,296955,2388
2,ARIZONA,2011,361339,1502
3,ARKANSAS,2011,194988,2905
4,CALIFORNIA,2011,266646,1267
...,...,...,...,...
303,TEXAS,2018,279337,661
304,UTAH,2018,143286,363
305,VIRGINIA,2018,229837,352
306,WASHINGTON,2018,151524,332


In [3]:
# Generate our categorical variable lists
crime_cat = crime_df.dtypes[crime_df.dtypes == "object"].index.tolist()
crime_cat

['State']

In [4]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(crime_df[crime_cat]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(crime_cat)
encode_df.head()

Unnamed: 0,State_ALABAMA,State_ALASKA,State_ARIZONA,State_ARKANSAS,State_CALIFORNIA,State_COLORADO,State_CONNECTICUT,State_DISTRICT OF COLUMBIA,State_FLORIDA,State_GEORGIA,...,State_PENNSYLVANIA,State_RHODE ISLAND,State_SOUTH CAROLINA,State_SOUTH DAKOTA,State_TENNESSEE,State_TEXAS,State_UTAH,State_VIRGINIA,State_WASHINGTON,State_WISCONSIN
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Merge one-hot encoded features and drop the originals
crime_df = crime_df.merge(encode_df,left_index=True, right_index=True)
crime_df = crime_df.drop(crime_cat,1)
crime_df

Unnamed: 0,Year,Population,Violent Crime,State_ALABAMA,State_ALASKA,State_ARIZONA,State_ARKANSAS,State_CALIFORNIA,State_COLORADO,State_CONNECTICUT,...,State_PENNSYLVANIA,State_RHODE ISLAND,State_SOUTH CAROLINA,State_SOUTH DAKOTA,State_TENNESSEE,State_TEXAS,State_UTAH,State_VIRGINIA,State_WASHINGTON,State_WISCONSIN
0,2011,200328,1797,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2011,296955,2388,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2011,361339,1502,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2011,194988,2905,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2011,266646,1267,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
303,2018,279337,661,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
304,2018,143286,363,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
305,2018,229837,352,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
306,2018,151524,332,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [6]:
# Remove loan status target from features data
y = crime_df.Population.values
X = crime_df.drop(columns=["Population","Violent Crime"], axis=1).values

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=100)

# Create a StandardScaler instances
#scaler = StandardScaler()

# Fit the StandardScaler
#X_scaler = scaler.fit(X_train)

# Scale the data
#X_train_scaled = X_scaler.transform(X_train)
#X_test_scaled = X_scaler.transform(X_test)

In [7]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=300, random_state=100) 

# Fitting the model
rf_model = rf_model.fit(X_train, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test)
print(f" Random forest model accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Random forest model accuracy: 0.000


In [8]:
y_pred

array([276978, 216363, 367917, 143898, 379429, 290664, 229947, 184898,
       548808, 110397, 277170, 167585, 121217, 319260, 123430, 128399,
       109997, 138321, 605436, 111101, 234356, 361895, 275581, 238776,
       210586, 364807, 148868, 282342, 492848, 361339, 500839, 621252,
       523226, 371649, 190246, 280298, 241436, 492848, 167339, 168017,
       995572, 162464, 158234, 464016, 198647, 212549, 210586, 484266,
       336598, 626848, 258883, 240705, 328627, 210586, 128399, 269822,
       229405, 336647, 183344, 124710, 106371, 214330, 299143, 199314,
       211569, 210586, 140350, 174170, 237527, 128399, 217123, 168591,
       227658, 216363, 918366, 128399, 461664])

In [9]:
population_pred_df = crime_df[crime_df["Year"]==2018]
population_pred_df["Year"] = population_pred_df["Year"]+10
prediction_table_df = population_pred_df.drop(["Violent Crime", "Population"],axis=1)
pop_predictions = rf_model.predict(prediction_table_df.values)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [10]:
prediction_table_df["Population"] = pop_predictions
year_pop_df = prediction_table_df[["Year","Population"]]
states_df = year_pop_df[[list(prediction_table_df.drop(["Year","Population"], axis=1).columns)]]
final_2029_df = pd.concat([year_pop_df, states_df])
final_2029_df

KeyError: "None of [Index([('State_ALABAMA', 'State_ALASKA', 'State_ARIZONA', 'State_ARKANSAS', 'State_CALIFORNIA', 'State_COLORADO', 'State_CONNECTICUT', 'State_DISTRICT OF COLUMBIA', 'State_FLORIDA', 'State_GEORGIA', 'State_HAWAII', 'State_IDAHO', 'State_ILLINOIS', 'State_INDIANA', 'State_IOWA', 'State_KANSAS', 'State_KENTUCKY', 'State_LOUISIANA', 'State_MARYLAND', 'State_MASSACHUSETTS', 'State_MICHIGAN', 'State_MICHIGAN5', 'State_MINNESOTA', 'State_MISSISSIPPI', 'State_MISSOURI', 'State_MISSOURI5', 'State_MONTANA', 'State_NEBRASKA', 'State_NEVADA', 'State_NEW HAMPSHIRE', 'State_NEW JERSEY', 'State_NEW MEXICO', 'State_NEW YORK', 'State_NORTH CAROLINA', 'State_NORTH DAKOTA', 'State_OHIO', 'State_OKLAHOMA', 'State_OREGON', 'State_PENNSYLVANIA', 'State_RHODE ISLAND', 'State_SOUTH CAROLINA', 'State_SOUTH DAKOTA', 'State_TENNESSEE', 'State_TEXAS', 'State_UTAH', 'State_VIRGINIA', 'State_WASHINGTON', 'State_WISCONSIN')], dtype='object')] are in the [columns]"