In [1]:
import pandas as pd
import numpy as np
import csv
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import GradientBoostingRegressor

In [2]:
#########################
# Data transformation
#########################
def transform(df, cross=True, scaler=None):
    """Turn a raw data frame into scaled model inputs.

    The ``id`` column is dropped and the ``cat*`` columns are replaced by
    integer codes (via ``cat_to_cont``) before any scaling happens.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing an ``id`` column and the categorical columns
        expected by ``cat_to_cont``. When ``cross`` is true it must also
        contain the target column ``loss``.
    cross : bool, default True
        If true, split the data 80/20 into train and hold-out parts and fit
        a new ``StandardScaler`` on the training part. If false, treat
        ``df`` as unlabeled data and scale it with the supplied ``scaler``.
    scaler : fitted scaler or None
        Only used when ``cross`` is false; must expose ``transform``.
        Ignored (and replaced) when ``cross`` is true.

    Returns
    -------
    tuple or numpy.ndarray
        ``cross=True``: ``(X_train, X_cross, y_train, y_cross, scaler)``.
        ``y_train`` is the natural log of the loss, while ``y_cross`` stays
        on the original scale so it can be compared directly with
        exponentiated predictions.
        ``cross=False``: the scaled feature matrix.

    Raises
    ------
    ValueError
        If ``cross`` is false and no ``scaler`` was supplied.
    """
    features = cat_to_cont(df.drop('id', axis=1))

    if cross:
        y = features.loss.values
        X = features.drop('loss', axis=1)
        X_train, X_cross, y_train, y_cross = train_test_split(
            X, y, test_size=0.2, random_state=42)

        # Fit the scaler on the training part only so that no statistics
        # from the hold-out part leak into training.
        scaler = preprocessing.StandardScaler().fit(X_train)

        X_train = scaler.transform(X_train)
        X_cross = scaler.transform(X_cross)

        # The model is trained on log(loss); only the training target is
        # transformed here.
        y_train = np.log(y_train)

        return X_train, X_cross, y_train, y_cross, scaler

    if scaler is None:
        raise ValueError(
            "transform(cross=False) requires a fitted scaler; "
            "pass the one returned by transform(cross=True)")

    # The frame is already encoded above. Encoding it a second time would
    # re-code the integer codes, so it is not repeated here.
    # `.values` replaces `DataFrame.as_matrix()`, which newer pandas removed.
    X_test = features.values

    return scaler.transform(X_test)


In [3]:
#######################################
# Convert categorical to integer codes
#######################################
def cat_to_cont(df):
    """Return a copy of ``df`` with categorical columns replaced by integer codes.

    Columns ``cat1`` .. ``cat116`` are cast to the pandas ``category`` dtype,
    then every column of ``category`` dtype (including any that already had
    that dtype on input) is replaced by its ``.cat.codes``.

    The input frame is not modified; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains all of the columns ``cat1`` .. ``cat116``.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns, categorical ones integer-coded.

    Raises
    ------
    KeyError
        If any of ``cat1`` .. ``cat116`` is missing from ``df``.

    Notes
    -----
    Codes are derived from the categories present in *this* frame, so the
    same label may receive different codes in two frames whose label sets
    differ (e.g. train vs. test).
    """
    cat_names = ["cat{}".format(i) for i in range(1, 117)]

    # `astype` with a column->dtype mapping returns a new frame, so the
    # caller's object is left untouched.
    encoded = df.astype({name: 'category' for name in cat_names})

    # Replace every category-typed column by its integer codes.
    cat_cols = encoded.select_dtypes(['category']).columns
    encoded[cat_cols] = encoded[cat_cols].apply(lambda col: col.cat.codes)

    return encoded

In [4]:
def predict_transformed(X, est):
    """Predict with ``est`` and map the result back through ``exp``.

    ``est.predict(X)`` is treated as a log-scale prediction and is
    exponentiated element-wise before being returned.
    """
    log_scale_pred = est.predict(X)
    return np.exp(log_scale_pred)

In [5]:
# Read the training data, the test data and the sample submission file.
df = pd.read_csv(filepath_or_buffer='data/train.csv')
df_test = pd.read_csv(filepath_or_buffer='data/test.csv')
sample_submission = pd.read_csv(filepath_or_buffer='data/sample_submission.csv')

In [16]:
# Split and scale the training frame; y_train comes back log-transformed,
# y_cross stays on the original loss scale.
X_train, X_cross, y_train, y_cross, scaler = transform(df)

# The `loss` argument is intentionally omitted: least squares is the default,
# and the old spelling 'ls' is rejected by newer scikit-learn releases.
est = GradientBoostingRegressor(n_estimators=500, learning_rate=0.1, max_depth=5,
                                subsample=0.9, random_state=42,
                                verbose=2)
est.fit(X_train, y_train)
   

      Iter       Train Loss      OOB Improve   Remaining Time 
         1           0.6095           0.0512           17.57m
         2           0.5704           0.0407           17.69m
         3           0.5383           0.0325           17.93m
         4           0.5092           0.0270           17.70m
         5           0.4881           0.0219           17.86m
         6           0.4673           0.0194           17.76m
         7           0.4514           0.0161           17.78m
         8           0.4372           0.0140           17.66m
         9           0.4256           0.0117           17.64m
        10           0.4166           0.0094           17.64m
        11           0.4080           0.0084           17.49m
        12           0.4006           0.0077           17.37m
        13           0.3937           0.0065           17.28m
        14           0.3868           0.0059           17.10m
        15           0.3813           0.0055           16.98m
       

In [17]:
    # Score the hold-out split on the original loss scale. The prediction is
    # computed once and shared by both metrics.
    pred_cross = predict_transformed(X_cross, est)
    mse = mean_squared_error(y_cross, pred_cross)
    mae = mean_absolute_error(y_cross, pred_cross)

    print("MSE  :   {}".format(mse))
    print("MAE  :   {}".format(mae))


MSE  :   3607452.92264
MAE  :   1140.82351385


In [18]:

# Keep the ids before `transform` drops the id column.
id_test = df_test.id.values

# Encode and scale the test frame with the scaler fitted on the training split.
X_test = transform(df_test, cross=False, scaler=scaler)

# Predictions are mapped back from log scale by predict_transformed.
pred_test = predict_transformed(X_test, est)

# One [id, predicted loss] pair per test row.
submission = [[id_test[row], loss] for row, loss in enumerate(pred_test)]
    

In [19]:
# Assemble the [id, loss] pairs into a two-column frame.
submission_columns = ['id', 'loss']
sub = pd.DataFrame(submission, columns=submission_columns)

In [20]:
# Sanity check: show the type of the submission object.
type(sub)

pandas.core.frame.DataFrame

In [21]:
# Sanity check: (rows, columns) of the submission frame.
sub.shape

(125546, 2)

In [22]:
# Preview the first five rows of the submission frame.
sub.head(n=5)

Unnamed: 0,id,loss
0,4,1461.764612
1,6,1736.581809
2,9,10019.936969
3,12,5669.986318
4,15,833.44809


In [23]:
# Write the submission without the row index so only id and loss are saved.
sub.to_csv(path_or_buf='submission.csv', index=False)

In [24]:
# Read the written file back to check what was actually saved.
sub_test = pd.read_csv(filepath_or_buffer='submission.csv')

In [25]:
# Preview the first five rows of the re-read submission file.
sub_test.head(n=5)

Unnamed: 0,id,loss
0,4,1461.764612
1,6,1736.581809
2,9,10019.936969
3,12,5669.986318
4,15,833.44809


In [29]:
# Sanity check: show the type of the sample submission object.
type(sample_submission)

pandas.core.frame.DataFrame

In [30]:
# Preview the sample submission to compare its layout with the generated file.
sample_submission.head(n=5)

Unnamed: 0,id,loss
0,4,0
1,6,0
2,9,0
3,12,0
4,15,0
