In [1]:
import pandas as pd
import numpy as np
import csv
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import GradientBoostingRegressor

In [2]:
#########################
# Transform data actions
#########################
def transform(df, cross=True, scaler=None):
    """Turn a raw data frame into scaled model inputs.

    The ``id`` column is dropped and the ``cat*`` columns are replaced by
    integer category codes (see ``cat_to_cont``) before scaling.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame with an ``id`` column, the ``cat*`` columns and, when
        ``cross`` is true, a ``loss`` column.
    cross : bool, default True
        If true, split ``df`` 80/20 into a training and a hold-out part
        (``random_state=42``) and fit a fresh ``StandardScaler`` on the
        training part.  If false, treat ``df`` as unlabeled data and scale
        it with the supplied ``scaler``.
    scaler : fitted scaler, optional
        Required when ``cross`` is false; ignored (a new one is fitted)
        when ``cross`` is true.

    Returns
    -------
    tuple or numpy.ndarray
        ``(X_train, X_cross, y_train, y_cross, scaler)`` when ``cross`` is
        true.  ``y_train`` is log-transformed, ``y_cross`` is left on the
        original scale.  When ``cross`` is false, only the scaled feature
        matrix is returned.

    Raises
    ------
    ValueError
        If ``cross`` is false and no ``scaler`` is given.

    Notes
    -----
    NOTE(review): category codes are derived separately from whichever frame
    is passed in, so a level that is missing from one of train/test can map
    the same label to different codes in the two frames -- confirm the data
    makes this harmless or share the category lists.
    """
    if not cross and scaler is None:
        raise ValueError("transform(cross=False) needs the scaler fitted on the training data")

    df = df.drop('id', axis=1)
    # Encode exactly once: a second pass would re-rank the codes produced by
    # the first one instead of leaving them alone.
    df = cat_to_cont(df)

    if cross:
        y = df.loss.values
        X = df.drop('loss', axis=1)
        X_train, X_cross, y_train, y_cross = train_test_split(X, y, test_size=0.2, random_state=42)

        # Fit on the training part only so the hold-out rows do not influence
        # the scaling statistics.
        scaler = preprocessing.StandardScaler().fit(X_train)

        X_train = scaler.transform(X_train)
        X_cross = scaler.transform(X_cross)

        # The model is trained on log(loss); predict_transformed() undoes this.
        y_train = np.log(y_train)

        return X_train, X_cross, y_train, y_cross, scaler

    # DataFrame.as_matrix() was removed from pandas; .values is available in
    # every pandas release and yields the same ndarray.
    X_test = scaler.transform(df.values)

    return X_test


In [3]:
#######################################
# Convert categorical to cont features
#######################################
def cat_to_cont(df, n_cat=116):
    """Replace categorical columns by their integer category codes.

    Columns ``cat1`` .. ``cat<n_cat>`` are cast to pandas ``category`` dtype,
    then every column of ``category`` dtype in the frame (including any that
    already had that dtype) is replaced by its ``.cat.codes``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``cat1`` .. ``cat<n_cat>``.
    n_cat : int, default 116
        Number of consecutively numbered ``cat`` columns to convert.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left unmodified.

    Raises
    ------
    KeyError
        If one of the expected ``cat`` columns is missing.
    """
    # Work on a copy so the caller's frame keeps its original labels.
    encoded = df.copy()

    for i in range(1, n_cat + 1):
        col_name = "cat{}".format(i)
        encoded[col_name] = encoded[col_name].astype('category')

    # Swap each categorical column for its integer codes.
    for col_name in encoded.select_dtypes(['category']).columns:
        encoded[col_name] = encoded[col_name].cat.codes

    return encoded

In [4]:
def predict_transformed(X, est):
    """Return ``est``'s predictions for ``X`` after applying ``exp``.

    The estimator's raw output is passed through ``np.exp``; in this
    notebook that maps predictions of a model trained on log(loss) back to
    the original loss scale.
    """
    raw_predictions = est.predict(X)
    return np.exp(raw_predictions)

In [5]:
# Read the training data, the test data and the sample submission file.
df, df_test, sample_submission = [
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/test.csv', 'data/sample_submission.csv')
]

In [6]:
# Split, encode and scale the training data; y_train comes back log-transformed.
X_train, X_cross, y_train, y_cross, scaler = transform(df)

# The loss argument is left at its default (squared error) on purpose: the
# old spelling 'ls' was removed in scikit-learn 1.2, while the new spelling
# 'squared_error' is rejected by releases before 1.0.  The default selects
# the same loss on both.
est = GradientBoostingRegressor(n_estimators=250, learning_rate=0.1, max_depth=6,
                                subsample=0.9, random_state=42,
                                verbose=2).fit(X_train, y_train)
   

      Iter       Train Loss      OOB Improve   Remaining Time 
         1           0.6068           0.0535           13.09m
         2           0.5653           0.0430           12.90m
         3           0.5307           0.0345           12.80m
         4           0.4999           0.0287           12.61m
         5           0.4779           0.0226           12.45m
         6           0.4569           0.0198           12.37m
         7           0.4404           0.0167           12.26m
         8           0.4257           0.0145           12.18m
         9           0.4135           0.0122           12.10m
        10           0.4039           0.0097           12.03m
        11           0.3949           0.0087           11.95m
        12           0.3873           0.0077           11.89m
        13           0.3798           0.0071           11.82m
        14           0.3719           0.0062           11.75m
        15           0.3671           0.0048           11.63m
       

In [7]:
    # Predict the hold-out split once and reuse the result for both metrics;
    # predictions and y_cross are both on the original loss scale.
    pred_cross = predict_transformed(X_cross, est)

    mse = mean_squared_error(y_cross, pred_cross)
    mae = mean_absolute_error(y_cross, pred_cross)

    print("MSE  :   {}".format(mse))
    print("MAE  :   {}".format(mae))


MSE  :   3576280.78103
MAE  :   1142.24951823


In [8]:

# Keep the ids aside: transform() drops the id column.
id_test = df_test.id.values

# Scale the test features with the scaler fitted on the training split.
X_test = transform(df_test, cross=False, scaler=scaler)
pred_test = predict_transformed(X_test, est)

# One [id, predicted loss] pair per test row.
submission = [[row_id, row_loss] for row_id, row_loss in zip(id_test, pred_test)]
    

In [9]:
# Two-column frame (id, loss) built from the [id, prediction] pairs.
sub = pd.DataFrame(submission, columns=['id', 'loss'])

In [10]:
# Sanity check: show the type of the object that will be written out.
type(sub)

pandas.core.frame.DataFrame

In [11]:
# Sanity check: (rows, columns) of the submission frame.
sub.shape

(125546, 2)

In [12]:
# Preview the first five id/loss rows.
sub.head(5)

Unnamed: 0,id,loss
0,4,1478.118796
1,6,1761.608038
2,9,9209.780647
3,12,5170.003789
4,15,833.722281


In [13]:
# Write the predictions to submission.csv; index=False keeps the row index out of the file.
sub.to_csv('submission.csv', index=False)

In [14]:
# Read the file back in so the written result can be inspected.
sub_test = pd.read_csv('submission.csv')

In [15]:
# Preview the first five rows of the file that was just read back.
sub_test.head(5)

Unnamed: 0,id,loss
0,4,1478.118796
1,6,1761.608038
2,9,9209.780647
3,12,5170.003789
4,15,833.722281


In [29]:
# Show the type of the sample submission loaded from data/sample_submission.csv.
type(sample_submission)

pandas.core.frame.DataFrame

In [30]:
# Preview the sample submission -- presumably to compare its layout with `sub`.
sample_submission.head(5)

Unnamed: 0,id,loss
0,4,0
1,6,0
2,9,0
3,12,0
4,15,0
