In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error

from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score

import xgboost as xgb

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
# Show what the input directory contains (raw `ls` output decoded as UTF-8).
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))

# Any results you write to the current directory are saved as output.

In [None]:
# Read the training and test tables from the input directory.
train_df, test_df = map(pd.read_csv, ("../input/train.csv", "../input/test.csv"))

In [None]:
# Report the training frame's dimensions, then preview its first rows.
n_rows, n_cols = train_df.shape
print('Size of training set: {} rows and {} columns'.format(n_rows, n_cols))
train_df.head()

In [None]:
# Walk every column whose name contains 'X', print its distinct values, and
# sort it into one of two lists:
#   single_val - columns with exactly one distinct value
#   newcols    - every other 'X' column
cols = [name for name in train_df.columns if 'X' in name]
single_val, newcols = [], []

for name in cols:
    print("Feature: ", name)
    distinct = train_df[name].unique()
    print(distinct)
    if distinct.size != 1:
        newcols.append(name)
        continue
    print("---------------> Single value")
    single_val.append(name)

print(single_val)

Some features have only one value. We can drop them, as they wouldn't have any impact:
['X11', 'X93', 'X107', 'X233', 'X235', 'X268', 'X289', 'X290', 'X293', 'X297', 'X330', 'X347'] 

In [None]:
# Remove the single-valued columns collected in `single_val`.
# Rebinding the name (instead of dropping in place) and ignoring labels that
# are already gone keeps this cell safe to re-run; the in-place form raised
# KeyError on a second execution because the columns no longer existed.
train_df = train_df.drop(columns=single_val, errors="ignore")

**Data preparation**
--------------------

In [None]:
# Save target column
# IDs do not correspond to line number, so we need to save it as well
train_y = train_df['y']
train_id = train_df['ID']

# Keep only the predictor columns. `columns=` replaces the positional axis
# argument (`drop("y", 1)`), which pandas 2.0 no longer accepts: everything
# after `labels` is keyword-only there and the old call raises TypeError.
train_df = train_df.drop(columns=["y", "ID"])

In [None]:
# Tally how many of the retained feature columns have each dtype.
print('Feature types:')
train_df.loc[:, newcols].dtypes.value_counts()

In [None]:
# The models need numeric input, but not every feature column is int64.
# Each non-int64 column is replaced by integer codes from a LabelEncoder
# that is re-fitted on that column.

label_encoder = LabelEncoder()

non_int_cols = [c for c in newcols if train_df[c].dtype != np.int64]
for c in non_int_cols:
    # fit_transform learns this column's classes and returns their codes.
    train_df[c] = label_encoder.fit_transform(train_df[c])

        

In [None]:
# Repeat the dtype tally now that the encoding cell has run.
print('Feature types:')
feature_dtypes = train_df[newcols].dtypes
feature_dtypes.value_counts()

In [None]:
# Hold out 30% of the rows (fixed seed), then wrap the remaining training
# part in the DMatrix container passed to xgb.train.
split_parts = train_test_split(train_df, train_y, test_size=0.3, random_state=7)
X_dtrain, X_test, y_dtrain, y_test = split_parts
dtrain = xgb.DMatrix(data=X_dtrain, label=y_dtrain)

In [None]:
# Linear booster with a squared-error objective.
#
# The number of boosting rounds is an argument of xgb.train, not a key in
# `params`: the former "nb_estimator": "1000" entry was not a recognised
# parameter, so training silently ran the default 10 rounds.
# "max_depth" was dropped because the linear booster has no trees to limit.
# NOTE(review): if depth-4 trees were the intent, switch "booster" to
# "gbtree" and restore "max_depth": 4 - confirm with the author.
params = {
    "objective": "reg:squarederror",  # current name of the deprecated "reg:linear"
    "booster": "gblinear",
}
model_xgb = xgb.train(params=params, dtrain=dtrain, num_boost_round=1000)

In [None]:
# In-sample predictions: score the same DMatrix the booster was trained on.
y_pred = model_xgb.predict(data=dtrain)

In [None]:
# Training error alone says nothing about generalisation, so the rows held
# out by train_test_split (X_test / y_test) are scored alongside it.
pd.Series(
    {
        "train": mean_squared_error(y_dtrain, y_pred),
        "holdout": mean_squared_error(y_test, model_xgb.predict(xgb.DMatrix(X_test))),
    },
    name="MSE",
)

Random Forest
-------------

In [None]:
# 100-tree random forest using max_features='log2' at each split.
# random_state pins the forest's randomness so that the fitted model - and
# the feature importances read from it afterwards - come out the same on
# every run; without it the fixed importance cut-off used later selects a
# different set of columns each time.
model_rfr = RandomForestRegressor(
    n_estimators=100, max_features='log2', random_state=7
).fit(X_dtrain, y_dtrain)

In [None]:
# In-sample predictions (kept under this name for the ensembling cell).
y_pred_rfr = model_rfr.predict(X_dtrain)

# Report the held-out error next to the training error; the forest is scored
# on rows it has already seen in the "train" figure, which is optimistic.
pd.Series(
    {
        "train": mean_squared_error(y_dtrain, y_pred_rfr),
        "holdout": mean_squared_error(y_test, model_rfr.predict(X_test)),
    },
    name="MSE",
)

In [None]:
# Table of random-forest feature importances, indexed by feature name and
# sorted ascending so the most important features end up at the top of the
# horizontal bar chart.
importances = model_rfr.feature_importances_

# Label the importances with the columns of X_dtrain - the frame the forest
# was fitted on - rather than train_df, whose columns stop matching the model
# once features are dropped from it (re-running this cell then failed with a
# length mismatch).
features = (
    pd.DataFrame({"feature": X_dtrain.columns, "importance": importances})
    .sort_values(by="importance", ascending=True)
    .set_index("feature")
)

ax = features.plot(kind='barh', figsize=(5, 60), legend=False)
ax.set(
    title="Random forest feature importances",
    xlabel="importance",
    ylabel="feature",
);

In [None]:
# Trying to remove some features: everything whose random-forest importance
# is below the hand-picked cut-off is dropped from train_df.
# `columns=` replaces the positional axis argument (`drop(todrop, 1)`), which
# pandas 2.0 rejects with TypeError.
# NOTE(review): X_dtrain / X_test were split off before this cell, so they
# still contain these columns and the later fits on X_dtrain are unaffected.
todrop = features.loc[features['importance'] < 0.000331].index.tolist()
train_df = train_df.drop(columns=todrop)

In [None]:
# Dimensions of train_df at this point in the notebook.
shape_msg = 'Size of training set: {} rows and {} columns'
print(shape_msg.format(*train_df.shape))

In [None]:
# Re-fit the forest. random_state makes the fit repeatable, so any change in
# the scores below reflects the data rather than sampling noise.
# NOTE(review): the fit still uses X_dtrain, which was split before columns
# were dropped from train_df, so the model sees every feature here.
model_rfr = RandomForestRegressor(
    n_estimators=100, max_features='log2', random_state=7
).fit(X_dtrain, y_dtrain)

# In-sample predictions (kept under this name for the ensembling cell).
y_pred_rfr = model_rfr.predict(X_dtrain)

# Training error next to the error on the held-out rows.
pd.Series(
    {
        "train": mean_squared_error(y_dtrain, y_pred_rfr),
        "holdout": mean_squared_error(y_test, model_rfr.predict(X_test)),
    },
    name="MSE",
)

GradientBoostingRegressor
-------------------------

In [None]:
# 1000 depth-1 trees (stumps) boosted with learning rate 0.1.
# The explicit loss='ls' was removed: scikit-learn renamed that option to
# 'squared_error' and dropped the old spelling in 1.2, where it raises.
# Least squares is the default loss under either name, so omitting the
# argument fits the same model on old and new releases alike.
model_gbr = GradientBoostingRegressor(
    n_estimators=1000, learning_rate=0.1, max_depth=1, random_state=0
).fit(X_dtrain, y_dtrain)

In [None]:
# In-sample predictions (kept under this name for the ensembling cell).
y_pred_gbr = model_gbr.predict(X_dtrain)

# Training error next to the error on the rows held out by train_test_split,
# which the model has not seen.
pd.Series(
    {
        "train": mean_squared_error(y_dtrain, y_pred_gbr),
        "holdout": mean_squared_error(y_test, model_gbr.predict(X_test)),
    },
    name="MSE",
)

Ensembling
----------

In [None]:
# Equal-weight blend of the three models' in-sample predictions.
y_glob = (y_pred_rfr + y_pred_gbr + y_pred)/3

# The same blend computed on the held-out rows, so the ensemble is also
# judged on data none of the models were fitted on.
holdout_glob = (
    model_rfr.predict(X_test)
    + model_gbr.predict(X_test)
    + model_xgb.predict(xgb.DMatrix(X_test))
)/3

pd.Series(
    {
        "train": mean_squared_error(y_dtrain, y_glob),
        "holdout": mean_squared_error(y_test, holdout_glob),
    },
    name="MSE",
)

Predictions from test_df
------------------------

In [None]:
# Keep the IDs for the submission file, then remove them from the predictors.
# `columns=` replaces the positional axis argument (`drop("ID", 1)`), which
# pandas 2.0 rejects with TypeError.
test_id = test_df['ID']
test_df = test_df.drop(columns="ID")

In [None]:
# Drop the same single-valued columns that were removed from the training
# frame, so the test frame keeps the feature set the models were fitted on.
# Rebinding the name and ignoring already-missing labels makes the cell safe
# to re-run; the in-place form raised KeyError on a second execution.
test_df = test_df.drop(columns=single_val, errors="ignore")

In [None]:
# Encode the non-int64 test columns with the *training* mapping.
#
# Fitting a fresh encoder on the test values (as this cell used to do) gives
# each category whatever rank it has among the test categories, so the same
# label can receive a different integer than the one the models were trained
# on. The raw training column is re-read here because train_df already holds
# encoded integers; fitting LabelEncoder on it reproduces the sorted class
# list used on the training side.
label_encoder = LabelEncoder()

cat_cols = [c for c in newcols if test_df[c].dtype != np.int64]
if cat_cols:
    train_raw = pd.read_csv("../input/train.csv", usecols=cat_cols)
    for c in cat_cols:
        label_encoder = label_encoder.fit(train_raw[c])
        # Position in classes_ is the training code; categories that never
        # occur in the training data get the sentinel -1 instead of raising.
        codes = pd.Categorical(test_df[c], categories=label_encoder.classes_).codes
        # int64 matches the other feature columns and makes a re-run of this
        # cell skip the column (it is no longer "non-int64").
        test_df[c] = codes.astype(np.int64)

In [None]:
# Wrap the prepared test frame in a DMatrix for the xgboost booster.
d_test = xgb.DMatrix(data=test_df)

In [None]:
# One prediction vector per fitted model: the booster takes the DMatrix, the
# two scikit-learn models take the frame directly.
y_test1 = model_xgb.predict(data=d_test)
y_test2, y_test3 = (model.predict(test_df) for model in (model_rfr, model_gbr))

In [None]:
# Average the three models' test predictions with equal weight and write the
# (ID, y) pairs to the submission CSV without the index column.
blended_y = (y_test1 + y_test2 + y_test3)/3
submission = pd.DataFrame({"ID": test_id, "y": blended_y})
submission.to_csv('mercedes4.csv', index=False)