In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Notebook-wide display settings: let pandas show up to 500 columns of a
# wide frame, and draw matplotlib figures with the dark background style.
pd.options.display.max_columns = 500
plt.style.use('dark_background')

In [60]:
def DataPrepPipeline(filename):
    """Load a CSV file and turn it into an all-numeric feature frame.

    Steps:
      1. Read ``filename`` with ``pd.read_csv``.
      2. Replace missing values in numeric columns with that column's mean.
      3. One-hot encode every non-numeric column with ``pd.get_dummies``.

    Parameters
    ----------
    filename : str, path object or file-like
        Anything accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The imputed frame with non-numeric columns replaced by dummy
        columns named ``<column>_<value>``.

    Notes
    -----
    Missing values in non-numeric columns are not imputed (a mean is not
    defined for them); ``get_dummies`` leaves all dummy columns of such a
    row at zero.
    """
    df = pd.read_csv(filename)

    # Impute numeric columns only. Taking the mean of a text column raises,
    # and the non-inplace form avoids chained-assignment pitfalls.
    df = df.fillna(df.mean(numeric_only=True))

    # Encode every non-numeric column, whatever dtype pandas chose for the
    # text (plain ``object`` or a dedicated string dtype).
    dummy_lst = [col for col in df.columns
                 if not pd.api.types.is_numeric_dtype(df[col])]
    train_set = pd.get_dummies(df, columns=dummy_lst)

    return train_set

In [84]:
def RFRegTrain(filename, targetfile, random_state=None):
    """Train a random-forest and a gradient-boosting regressor and report RMSE.

    The feature file is run through ``DataPrepPipeline`` and joined to the
    label file on ``row_id``. The merged frame is split 67/33 into train and
    hold-out parts, both models are fitted on the train part, and their
    hold-out RMSE is printed.

    Parameters
    ----------
    filename : str
        CSV file with the feature values; must contain a ``row_id`` column.
    targetfile : str
        CSV file with the labels; must contain ``row_id``, and the merged
        frame must contain the ``poverty_rate`` target column.
    random_state : int or None, optional
        Seed forwarded to the split and to both estimators. The default
        ``None`` keeps the original unseeded behaviour.

    Returns
    -------
    tuple
        ``(reg, gb_reg)``: the fitted ``RandomForestRegressor`` and
        ``GradientBoostingRegressor``.
    """
    from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import train_test_split

    # load the data
    df = DataPrepPipeline(filename)
    df_labels = pd.read_csv(targetfile)
    df = df.merge(df_labels, on='row_id')

    # train_test_split
    # Build X / y from the merged frame created above, not from a
    # notebook-global left over from an earlier cell.
    target = 'poverty_rate'
    y = df[target]
    X = df.drop(labels=['row_id', target], axis=1)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=random_state)

    # RF Regressor
    reg = RandomForestRegressor(
        n_estimators=500, verbose=True, random_state=random_state)
    reg.fit(X_train, y_train)
    print('RF Model Trained!')

    # GB Regressor
    gb_reg = GradientBoostingRegressor(
        n_estimators=500, verbose=True, random_state=random_state)
    gb_reg.fit(X_train, y_train)
    print('GB Model Trained!')

    # RMSE on the hold-out split
    print('The RF RMSE score is %.2f' % np.sqrt(mean_squared_error(y_test, reg.predict(X_test))))
    print('The GB RMSE score is %.2f' % np.sqrt(mean_squared_error(y_test, gb_reg.predict(X_test))))

    return reg, gb_reg

In [86]:
# Fit both regressors on the training files; the call prints progress and
# the hold-out RMSE of each model.
reg, gb_reg = RFRegTrain(
    filename='train_values.csv',
    targetfile='train_labels.csv',
)

[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:   18.7s finished


RF Model Trained!
      Iter       Train Loss   Remaining Time 
         1          39.6198            3.00s
         2          35.1774            3.25s
         3          31.4448            3.16s
         4          28.2972            2.98s
         5          25.6384            3.08s
         6          23.3585            3.05s
         7          21.4309            3.04s
         8          19.7803            3.08s
         9          18.3472            3.06s
        10          17.1529            3.10s
        20          10.6973            2.86s
        30           8.5229            2.75s
        40           7.3106            2.61s
        50           6.5342            2.43s
        60           6.0104            2.32s
        70           5.5907            2.22s
        80           5.2813            2.12s
        90           5.0254            2.02s
       100           4.7805            1.95s
       200           3.1584            1.37s
       300           2.2822         

[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.0s finished


In [88]:
# Prepare the submission features with the same pipeline used for training.
test_set = DataPrepPipeline('test_values.csv')
X_test = test_set.drop('row_id', axis=1)

# The test file is one-hot encoded on its own, so a category level missing
# from (or extra in) it would change the set of dummy columns. When the
# fitted model recorded its training column names, align to them: absent
# dummy columns become 0 and unseen ones are dropped.
train_columns = getattr(gb_reg, 'feature_names_in_', None)
if train_columns is not None:
    X_test = X_test.reindex(columns=train_columns, fill_value=0)

# Submission frame: one prediction per row_id.
output = pd.DataFrame(test_set['row_id'])
output['poverty_rate'] = gb_reg.predict(X_test)

In [89]:
# Save the predictions to output.csv, leaving out the DataFrame index.
output.to_csv(path_or_buf='output.csv', index=False)

# Final: 85 PTS