# Traditional non-DL method
1. Random Forest

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
import xgboost as xgb

## Load and normalize the dataset

In [2]:
# Load the train/test splits. Every column except the last is treated as a
# feature; the 'label' column is the regression target.
train = pd.read_csv('data/Train.csv')
test = pd.read_csv('data/Test.csv')
feature_name = train.columns.values[:-1].tolist()  # assumes 'label' is the last column — TODO confirm
train_feature_raw = train[feature_name]
test_feature_raw = test[feature_name]

# Standardize both splits with statistics from the TRAINING split only, so the
# test split does not influence preprocessing. The statistics are computed
# once and reused instead of being recomputed for every expression.
feature_mean = train_feature_raw.mean()
# A constant column has std == 0 and dividing by it yields NaN/inf; scale such
# columns by 1 so they become all zeros after centering.
feature_std = train_feature_raw.std().replace(0, 1)
train_feature = ((train_feature_raw - feature_mean) / feature_std).values
test_feature = ((test_feature_raw - feature_mean) / feature_std).values

# Targets as plain NumPy arrays, matching the feature matrices.
train_label = train['label'].values
test_label = test['label'].values

## Random Forest

In [3]:
# Baseline: a small random forest (10 trees, depth capped at 7) trained on the
# standardized features and scored by mean squared error on the test split.
rf_raw = RandomForestRegressor(
    n_estimators=10,
    max_depth=7,
    n_jobs=-1,
    random_state=7,
)
rf_raw.fit(train_feature, train_label)
rf_pred = rf_raw.predict(test_feature)
np.mean(np.square(test_label - rf_pred))

0.783091017694738

## Adaboost

In [None]:
# AdaBoost over depth-4 regression trees (100 boosting stages, squared loss),
# scored by mean squared error on the test split.
ada_raw = AdaBoostRegressor(
    DecisionTreeRegressor(max_depth=4),
    n_estimators=100,
    loss='square',
    random_state=7,
)
ada_raw.fit(train_feature, train_label)
ada_pred = ada_raw.predict(test_feature)
np.mean(np.square(test_label - ada_pred))

## Xgboost

In [13]:
# Wrap the standardized feature arrays and their targets in xgboost DMatrix
# objects, one per split.
xgb_train = xgb.DMatrix(data=train_feature, label=train_label)
xgb_test = xgb.DMatrix(data=test_feature, label=test_label)

In [4]:
# Parameter set handed to xgb.train for the tree booster. Built from ordered
# (name, value) pairs so the grouping below stays visible.
param_gbtree = dict([
    # --- general ---
    ('booster', 'gbtree'),
    # --- tree booster ---
    ('eta', 0.01),
    ('gamma', 0),
    ('max_depth', 5),
    ('min_child_weight', 1),
    ('subsample', 0.9),
    ('lambda', 0.5),
    ('alpha', 0),
    ('tree_method', 'auto'),
    ('num_parallel_tree', 1),  # For Random Forest
    # --- learning task ---
    # 'objective' is deliberately left unset here (it was commented out as
    # 'reg:squarederror'), so the library default applies.
    ('eval_metric', ['rmse']),
])
# The evaluation watch-list is not built in this cell; the line is kept
# disabled exactly as it was:
#evallist = [(xgb_train, 'train'),(xgb_test, 'test')]

In [28]:
# Train the tree booster for at most `num_round` rounds, stopping early once
# the 'test' rmse has not improved for 10 consecutive rounds.
num_round = 500
# Build the watch-list in this cell. `evallist` was otherwise only assigned in
# the later PCA cell, so on a fresh kernel this cell failed with NameError, and
# after the PCA cell had run it picked up the PCA-projected DMatrix pairs
# rather than the matrices trained on here.
evallist = [(xgb_train, 'train'), (xgb_test, 'test')]
# NOTE(review): early stopping is driven by the test split, so the test error
# reported afterwards is optimistic; consider a separate validation split.
bst_gbtree_raw = xgb.train(param_gbtree, xgb_train, num_round, evallist, early_stopping_rounds=10)

[0]	train-rmse:1.03676	test-rmse:1.04051
Multiple eval metrics have been passed: 'test-rmse' will be used for early stopping.

Will train until test-rmse hasn't improved in 10 rounds.
[1]	train-rmse:1.03396	test-rmse:1.03955
[2]	train-rmse:1.03112	test-rmse:1.0389
[3]	train-rmse:1.02822	test-rmse:1.03814
[4]	train-rmse:1.02599	test-rmse:1.03739
[5]	train-rmse:1.02342	test-rmse:1.03611
[6]	train-rmse:1.02118	test-rmse:1.03551
[7]	train-rmse:1.01896	test-rmse:1.03325
[8]	train-rmse:1.01634	test-rmse:1.03097
[9]	train-rmse:1.01402	test-rmse:1.03036
[10]	train-rmse:1.01198	test-rmse:1.02958
[11]	train-rmse:1.00955	test-rmse:1.02922
[12]	train-rmse:1.00761	test-rmse:1.02839
[13]	train-rmse:1.00535	test-rmse:1.02631
[14]	train-rmse:1.00318	test-rmse:1.02587
[15]	train-rmse:1.00143	test-rmse:1.02591
[16]	train-rmse:0.999423	test-rmse:1.02503
[17]	train-rmse:0.997354	test-rmse:1.02335
[18]	train-rmse:0.995364	test-rmse:1.02271
[19]	train-rmse:0.993612	test-rmse:1.02075
[20]	train-rmse:0.991353

In [29]:
# Score the trained booster on the test DMatrix (mean squared error).
xgb_pred = bst_gbtree_raw.predict(xgb_test)
np.mean(np.square(test_label - xgb_pred))

0.927202910179172

## Dimensionality Reduction with PCA

In [6]:
from sklearn.decomposition import PCA  # NOTE(review): better placed in the notebook's top import cell

# Settings shared by both PCA experiments (previously repeated inline).
ADA_PCA_N_ESTIMATORS = 20
XGB_PCA_NUM_ROUND = 500
XGB_PCA_EARLY_STOPPING = 10


def _mse(y_true, y_pred):
    """Return the mean squared error between targets and predictions."""
    return ((y_true - y_pred) ** 2).mean()


def _pca_project(variance_kept, fit_x, other_x):
    """Fit a PCA on `fit_x` and project both matrices with it.

    `variance_kept` is passed to PCA as `n_components`.
    Returns (fitted PCA, projected fit_x, projected other_x).
    """
    reducer = PCA(n_components=variance_kept)
    reducer.fit(fit_x)
    return reducer, reducer.transform(fit_x), reducer.transform(other_x)


def _adaboost_on(train_x, train_y, test_x):
    """Fit AdaBoost over depth-4 trees; return (model, predictions for test_x)."""
    model = AdaBoostRegressor(
        DecisionTreeRegressor(max_depth=4),
        n_estimators=ADA_PCA_N_ESTIMATORS,
        loss='square',
        random_state=7,
    )
    model.fit(train_x, train_y)
    return model, model.predict(test_x)


def _xgboost_on(dtrain, dtest):
    """Train a booster with `param_gbtree`; return (watch-list, booster, predictions for dtest)."""
    watchlist = [(dtrain, 'train'), (dtest, 'test')]
    booster = xgb.train(
        param_gbtree,
        dtrain,
        XGB_PCA_NUM_ROUND,
        watchlist,
        early_stopping_rounds=XGB_PCA_EARLY_STOPPING,
    )
    return watchlist, booster, booster.predict(dtest)


# Project the standardized features; each PCA is fitted on the training split
# only and then applied to both splits.
pca_99, train_feature_pca99, test_feature_pca99 = _pca_project(0.99, train_feature, test_feature)
pca_95, train_feature_pca95, test_feature_pca95 = _pca_project(0.95, train_feature, test_feature)

# Random Forest on the PCA features is disabled (it was commented out here).
# To re-enable it, fit RandomForestRegressor(10, max_depth=7, n_jobs=-1,
# random_state=7) on train_feature_pca99 / train_feature_pca95 and report
# "RF_PCA99: %f" / "RF_PCA95: %f" with _mse on the matching test projection.

# AdaBoost
ada_pca99, ada_pca99_pred = _adaboost_on(train_feature_pca99, train_label, test_feature_pca99)
print("ADA_PCA99: %f" % _mse(test_label, ada_pca99_pred))

ada_pca95, ada_pca95_pred = _adaboost_on(train_feature_pca95, train_label, test_feature_pca95)
print("ADA_PCA95: %f" % _mse(test_label, ada_pca95_pred))

# XGBoost
# `evallist` is still rebound at module level after each run, as before, in
# case another cell reads it.
xgb_pca99_train = xgb.DMatrix(train_feature_pca99, label=train_label)
xgb_pca99_test = xgb.DMatrix(test_feature_pca99, label=test_label)
evallist, bst_gbtree_pca99, xgb_pca99_pred = _xgboost_on(xgb_pca99_train, xgb_pca99_test)
print("XGB_PCA99: %f" % _mse(test_label, xgb_pca99_pred))

xgb_pca95_train = xgb.DMatrix(train_feature_pca95, label=train_label)
xgb_pca95_test = xgb.DMatrix(test_feature_pca95, label=test_label)
evallist, bst_gbtree_pca95, xgb_pca95_pred = _xgboost_on(xgb_pca95_train, xgb_pca95_test)
print("XGB_PCA95: %f" % _mse(test_label, xgb_pca95_pred))

KeyboardInterrupt: 

In [7]:
# (rows, retained components) of the training matrix after the PCA fitted with n_components=0.99
train_feature_pca99.shape

(1376901, 85)

In [8]:
# (rows, retained components) of the training matrix after the PCA fitted with n_components=0.95
train_feature_pca95.shape

(1376901, 59)

In [14]:
# Variance captured by each component kept by the n_components=0.95 PCA.
# NOTE(review): the recorded output lists a single value although the projected
# training matrix shown earlier has 59 columns — presumably pca_95 was refitted
# on different data before this cell ran; confirm with a clean top-to-bottom run.
print(pca_95.explained_variance_)

[138.99949652]


In [21]:
# Display the current training feature matrix.
# NOTE(review): the recorded values are almost all identical, unlike the
# per-column standardized values displayed at the end of the notebook —
# presumably `train_feature` held something else when this ran; confirm after
# a clean re-run.
train_feature

array([[-0.0001879 , -0.0001879 , -0.0001879 , ..., -0.00018785,
        -0.0001879 , -0.0001879 ],
       [-0.0001879 , -0.0001879 , -0.0001879 , ..., -0.00018787,
        -0.0001879 , -0.0001879 ],
       [-0.0001879 , -0.0001879 , -0.0001879 , ..., -0.00018785,
        -0.0001879 , -0.0001879 ],
       ...,
       [-0.0001879 , -0.0001879 , -0.0001879 , ..., -0.00018785,
        -0.0001879 , -0.0001879 ],
       [-0.0001879 , -0.0001879 , -0.0001879 , ..., -0.00018788,
        -0.0001879 , -0.0001879 ],
       [-0.0001879 , -0.0001879 , -0.0001879 , ..., -0.00018786,
        -0.0001879 , -0.0001879 ]])

In [22]:
# Scratch re-load of the CSV files to inspect the normalization step.
# NOTE(review): this rebinds `train_feature` and `train_label` to raw,
# un-normalized pandas objects, replacing the standardized NumPy arrays built
# in the first loading cell. Model cells re-run after this one would see the
# raw data instead.
train = pd.read_csv('data/Train.csv')
test = pd.read_csv('data/Test.csv')
feature_name = train.columns[:-1].tolist()  # every column except the last
train_label = train['label']
train_feature = train[feature_name]

In [28]:
# Per-column z-score of the raw feature frame (subtract the column mean, divide
# by the column std), kept under a separate name next to the raw values.
train_featureee = train_feature.sub(train_feature.mean()).div(train_feature.std()).values
train_feature_value = train_feature.values

In [29]:
# Display the z-scored feature array computed in the previous cell.
train_featureee

array([[-0.88496328,  0.62067379, -0.002088  , ...,  0.9796345 ,
        -0.16664853, -0.13855176],
       [ 0.93851154, -0.05079964, -0.002088  , ..., -0.54684079,
        -0.44532126, -0.41565155],
       [ 0.00339625,  0.45401888, -0.002088  , ...,  0.567173  ,
        -0.18353779, -0.15766209],
       ...,
       [ 0.84500001, -0.12091322, -0.002088  , ...,  0.72274993,
         0.34002915,  0.37742718],
       [ 0.93851154, -0.11011913, -0.002088  , ..., -1.26048582,
        -0.71554936, -0.70230652],
       [-0.89431443,  0.00343522, -0.002088  , ..., -0.11150177,
        -0.13287002, -0.10033109]])