In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (7, 4.5)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [None]:
from sklearn.ensemble import RandomForestRegressor
#use regressor for numerical values, and classifier for string labels

# Plan: fit a random forest with every variable in turn as the target (y),
# then sum each feature's importance across all targets.
# for city, province and region, make them mutually exclusive when fitting
# `importances` layout: row = predictor feature, column = the Y variable used.

data1 = pd.read_csv("OutliersRemoved2.csv")
# remove chinese name, province key, region key, year
data1 = data1.drop(data1.columns[[0, 3, 5, 6]], axis=1)
features = data1.describe()
locations = ['City/province name (EN)', 'Province', 'Region']

# Mean-impute every column except the location columns (non-numerical data
# cannot be imputed this way). `features.loc["mean"]` is a Series keyed by
# column name, so fillna applies each column's own mean.
numeric_cols = [col for col in data1.columns if col not in locations]
data1[numeric_cols] = data1[numeric_cols].fillna(features.loc["mean"])

# shuffle
data1 = shuffle(data1, random_state=0)

# Square feature-by-target table, filled in by the fitting cells.
# Initialised with a float zero: the values written later are float
# importances, and writing floats into an integer-typed frame relies on
# silent upcasting that recent pandas versions deprecate.
importances = pd.DataFrame(data=0.0, index=data1.columns, columns=data1.columns)

In [None]:
# takes a whole night to run; run it on a separate notebook to prevent computer crashing (memory)
# Seeded so that repeated runs give the same importances.
rf2 = RandomForestRegressor(n_estimators=10, random_state=0)

# drop the non-numericals
X3 = data1.drop(locations, axis=1)

# Use every numeric column in turn as the regression target and all the
# remaining numeric columns as predictors.
for y_val in X3.columns:
    Y3 = X3[y_val]
    # Build the predictor frame for this target only. Rebinding X3 here would
    # permanently shrink it, so each later target would see fewer predictors
    # and the final one would be fitted on zero features.
    X3_features = X3.drop(columns=y_val)
    X3_train, X3_test, Y3_train, Y3_test = train_test_split(
        X3_features, Y3, test_size=0.20, random_state=0
    )
    rf2.fit(X3_train, Y3_train)
    run = rf2.feature_importances_

    # feature_importances_ follows the column order of the training frame,
    # so it lines up with X3_features.columns.
    importances.loc[X3_features.columns, y_val] = run

In [None]:
# unlooped for the three non-numerical to improve runtime

#predict city
# The city column is treated as non-numerical throughout this notebook, so it
# is fitted with a classifier; a regressor cannot be trained on string labels.
rf_clf = RandomForestClassifier(n_estimators=10, random_state=0)

# Predictors: everything except the three location columns (the other two
# are excluded so the location variables stay mutually exclusive).
X4 = data1.drop(['Province', 'Region'], axis=1)
Y4 = X4["City/province name (EN)"]
X4 = X4.drop("City/province name (EN)", axis=1)
X4_train, X4_test, Y4_train, Y4_test = train_test_split(X4, Y4, test_size=0.20, random_state=0)
rf_clf.fit(X4_train, Y4_train)
# One importance per column of X4, in column order.
run = rf_clf.feature_importances_

In [None]:
# Copy the latest fit's importances into the city column of `importances`:
# one row per predictor in X4. X4 never contains the city column itself,
# so every predictor is written.
for feature_name, weight in zip(X4.columns, run):
    importances.loc[feature_name, "City/province name (EN)"] = weight


In [None]:
# predict provinces
# "Province" is treated as non-numerical throughout this notebook, so it is
# fitted with a classifier; a regressor cannot be trained on string labels.
rf_clf = RandomForestClassifier(n_estimators=10, random_state=0)

# Predictors: everything except the three location columns.
X4 = data1.drop(['City/province name (EN)', 'Region'], axis=1)
Y4 = X4["Province"]
X4 = X4.drop("Province", axis=1)
X4_train, X4_test, Y4_train, Y4_test = train_test_split(X4, Y4, test_size=0.20, random_state=0)
rf_clf.fit(X4_train, Y4_train)
# One importance per column of X4, in column order.
run = rf_clf.feature_importances_

# Same importances as a one-column frame indexed by predictor name.
imp = pd.DataFrame(
        run,
        columns=["Province"],
        index=X4.columns)



In [None]:
# Copy the latest fit's importances into the "Province" column of
# `importances`: one row per predictor in X4. X4 never contains "Province"
# itself, so every predictor is written.
for feature_name, weight in zip(X4.columns, run):
    importances.loc[feature_name, "Province"] = weight


In [None]:
# predict region
# "Region" is treated as non-numerical throughout this notebook, so it is
# fitted with a classifier; a regressor cannot be trained on string labels.
rf_clf = RandomForestClassifier(n_estimators=10, random_state=0)

# Predictors: everything except the three location columns.
X4 = data1.drop(['City/province name (EN)', 'Province'], axis=1)
Y4 = X4["Region"]
X4 = X4.drop("Region", axis=1)
X4_train, X4_test, Y4_train, Y4_test = train_test_split(X4, Y4, test_size=0.20, random_state=0)
rf_clf.fit(X4_train, Y4_train)
# One importance per column of X4, in column order.
run = rf_clf.feature_importances_

# Same importances as a one-column frame indexed by predictor name.
imp = pd.DataFrame(
        run,
        columns=["Region"],
        index=X4.columns)

In [None]:
# Copy the latest fit's importances into the "Region" column of
# `importances`: one row per predictor in X4. X4 never contains "Region"
# itself, so every predictor is written.
for feature_name, weight in zip(X4.columns, run):
    importances.loc[feature_name, "Region"] = weight


In [None]:
# Total importance per feature: add up each row of `importances` (one value
# per target column) and keep the result as a one-column frame, ordered like
# data1's columns.
row_totals = importances.sum(axis=1)
imp_sum = row_totals.reindex(data1.columns).to_frame(name="Importance")

In [None]:
# graph the sums to show which features are the most important for variation
# graph the 3 classifiers and the top 10 regressors separately

# 3 classifiers
# Select the location rows by label rather than by position, so the plot does
# not silently depend on the column order of the CSV.
# NOTE(review): in the cells of this notebook the three location columns are
# never part of a predictor matrix, so unless other code fills these rows they
# keep their initial value of 0 - confirm this plot shows what was intended.
classifiers = imp_sum.loc[locations]
classifiers = classifiers.sort_values('Importance', ascending=True)
ax = classifiers.plot(kind='barh', legend=False)
ax.set(title="Summed importance: location variables", xlabel="Importance");

In [None]:
# top 10 regressors
# Everything except the three location rows, ordered from least to most
# important.
regressors = imp_sum.drop(index=locations)
regressors = regressors.sort_values('Importance', ascending=True)
# With an ascending sort the ten most important features are the LAST ten
# rows; taking the first ten would plot the ten least important ones. Keeping
# ascending order puts the largest bar at the top of the horizontal chart.
ax = regressors.tail(10).plot(kind='barh', legend=False)
ax.set(title="Top 10 numeric features by summed importance", xlabel="Importance");

In [None]:
# TODO reminder only: this cell prints a note and performs no scaling itself.
print("use feature scaling on train/test for logreg")


In [None]:
# TODO reminder only: this cell prints a note and fits no model itself.
print("using regularization within logreg")

In [None]:
# TODO reminder only: this cell prints a note and performs no clustering itself.
print("try clustering for visualization.")