In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn import model_selection, preprocessing

# Load the training, test and macro CSV files, parsing the `timestamp`
# column of each as datetimes.
try:
    df_train = pd.read_csv("../input/train.csv", parse_dates=['timestamp'])
    df_test = pd.read_csv("../input/test.csv", parse_dates=['timestamp'])
    df_macro = pd.read_csv("../input/macro.csv", parse_dates=['timestamp'])
except Exception:
    # Every later cell depends on these three frames. Swallowing the error
    # here would only postpone the failure to a confusing NameError further
    # down, so report it and re-raise the original exception.
    print ("Oh snap!")
    raise
else:
    print ("Training Data Loaded with {} samples and {} features".format(*df_train.shape))
    print ("Testing Data Loaded with {} samples and {} features".format(*df_test.shape))
    print ("Macro Data Loaded with {} samples and {} features".format(*df_macro.shape))

# Round 1) Running raw data through XGB to generate baseline score
### Score: 0.32305

In [None]:
# Deal with categorical features.
# Each string (object-dtype) column is label-encoded with ONE encoder fitted on
# the values of both the training and the test frame, and that encoder is
# applied to both frames. LabelEncoder assigns codes from the sorted set of
# values it was fitted on, so fitting the two frames separately can give the
# same category different integer codes in train and test.
for f in df_train.columns:
    if df_train[f].dtype=='object':
        lbl = preprocessing.LabelEncoder()
        train_values = list(df_train[f].values)
        # Only share the encoder when the test frame has the same column and
        # it is still un-encoded there.
        in_test = f in df_test.columns and df_test[f].dtype=='object'
        test_values = list(df_test[f].values) if in_test else []
        lbl.fit(train_values + test_values)
        df_train[f] = lbl.transform(train_values)
        if in_test:
            # After this assignment the test column is numeric, so a later
            # object-dtype check on df_test will skip it.
            df_test[f] = lbl.transform(test_values)
        
# Booster settings for this round.
xgb_params = {
    'eta': 0.05,
    'max_depth': 5,
    'subsample': 1.0,
    'colsample_bytree': 0.7,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1
}

# Feature matrix: every column except the row id, the raw timestamp and the
# target. The target vector is the price_doc column.
train_X = df_train.drop(["id", "timestamp", "price_doc"], axis=1)
train_y = df_train["price_doc"].values

# Wrap features and labels in xgboost's DMatrix container, keeping the
# DataFrame column names as feature names.
XGB_Train = xgb.DMatrix(
    train_X,
    label=train_y,
    feature_names=train_X.columns.values,
)

# Fit the booster for a fixed 100 rounds.
model = xgb.train(xgb_params, XGB_Train, num_boost_round=100)

# Bar chart of the 50 highest-ranked features of the fitted model.
fig, ax = plt.subplots(figsize=(12, 18))
xgb.plot_importance(model, ax=ax, height=0.8, max_num_features=50)
plt.show()

In [None]:
# Label-encode whatever string (object-dtype) columns the test frame still has.
# NOTE(review): each encoder below is fitted on the test frame's own values
# only, so its integer codes match the training encoding only if both frames
# hold the same set of categories (or the column was already encoded
# upstream) -- confirm.
test_object_cols = [col for col in df_test.columns if df_test[col].dtype=='object']
for col in test_object_cols:
    encoder = preprocessing.LabelEncoder()
    raw_values = list(df_test[col].values)
    df_test[col] = encoder.fit(raw_values).transform(raw_values)

# Drop the non-feature columns and wrap the rest for xgboost.
test = df_test.drop(["id", "timestamp"], axis=1)
XGB_Test = xgb.DMatrix(test)

# Baseline predictions from the model fitted above.
ypred = model.predict(XGB_Test)

# Merge predictions with ID's to create submission
idCol = df_test["id"].values
if len(ypred) != len(idCol):
    # Stop here: without this, the to_csv call below would run against a
    # df_sub that was never built in this cell (NameError, or a stale frame
    # left over from an earlier run).
    print("Oh snap!")
    raise ValueError(
        "prediction count ({}) does not match id count ({})".format(len(ypred), len(idCol))
    )

# Two-column table (id, price_doc). The cast to int truncates the predicted
# prices to whole numbers.
sub = np.column_stack((idCol,ypred))
df_sub = pd.DataFrame(data=sub,columns=["id","price_doc"])
df_sub = df_sub.astype(int)
print("Final output:")
print(df_sub.head())

# Generating first submission and submitting for baseline score
df_sub.to_csv('submission1.csv', index=False)

# Score = 0.32305

# Round 2) Joining macro data and doing some basic engineering
### Score: 0.34993

In [None]:
# NOTE(review): the next cell repeats these same steps before its feature
# engineering, so this cell appears to be redundant.

# Deal with categorical features in macro data
for f in df_macro.columns:
    if df_macro[f].dtype=='object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(df_macro[f].values))
        df_macro[f] = lbl.transform(list(df_macro[f].values))

# Joining macro data and combining test/train into single dataframe.
# The training rows come first so they can be recovered later by position
# (the first num_train rows); the target is set aside in train_labels.
num_train = len(df_train)
train_labels = df_train[['id','price_doc']]
df_trainingFeatures = df_train.drop(['price_doc'], axis=1)
df_all = pd.concat([df_trainingFeatures, df_test], ignore_index=True)

# A plain left merge keeps the left frame's row order. merge_ordered would
# return the rows sorted by timestamp instead, which only matches the
# train-then-test layout when the inputs happen to be chronological.
df_all = df_all.merge(df_macro, on='timestamp', how='left')

# The positional split relies on the join adding columns only, never rows
# (i.e. at most one macro row per timestamp).
assert len(df_all) == num_train + len(df_test), "macro join changed the row count"

In [None]:
# Deal with categorical features in macro data
for f in df_macro.columns:
    if df_macro[f].dtype=='object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(df_macro[f].values))
        df_macro[f] = lbl.transform(list(df_macro[f].values))

# Joining macro data and combining test/train into single dataframe.
# The training rows come first so they can be recovered later by position
# (the first num_train rows); the target is set aside in train_labels.
num_train = len(df_train)
train_labels = df_train[['id','price_doc']]
df_trainingFeatures = df_train.drop(['price_doc'], axis=1)
df_all = pd.concat([df_trainingFeatures, df_test], ignore_index=True)

# A plain left merge keeps the left frame's row order. merge_ordered would
# return the rows sorted by timestamp instead, which only matches the
# train-then-test layout when the inputs happen to be chronological.
df_all = df_all.merge(df_macro, on='timestamp', how='left')

# The positional split relies on the join adding columns only, never rows
# (i.e. at most one macro row per timestamp).
assert len(df_all) == num_train + len(df_test), "macro join changed the row count"

# ------
# Feature engineering the timestamp a bit

ts = df_all.timestamp
ts_year = ts.dt.year
ts_month = ts.dt.month
# ISO-8601 week number. (Newer pandas exposes this as dt.isocalendar().week.)
ts_week = ts.dt.weekofyear

# Add month-year count: how many rows fall in the same calendar month.
month_year = (ts_month + ts_year * 100)
month_year_cnt_map = month_year.value_counts().to_dict()
df_all['month_year_cnt'] = month_year.map(month_year_cnt_map)

# Add week-year count: how many rows fall in the same ISO week.
# The ISO week must be paired with the ISO year, not the calendar year:
#   * late-December days in ISO week 1 belong to the NEXT year's first week;
#   * early-January days in ISO week 52/53 belong to the PREVIOUS year's last week.
# Pairing them with the calendar year would merge those days into an unrelated
# week almost a year away and distort both counts.
week1_in_december = ((ts_week == 1) & (ts_month == 12)).astype(int)
last_week_in_january = ((ts_week >= 52) & (ts_month == 1)).astype(int)
iso_year = ts_year + week1_in_december - last_week_in_january
week_year = (ts_week + iso_year * 100)
week_year_cnt_map = week_year.value_counts().to_dict()
df_all['week_year_cnt'] = week_year.map(week_year_cnt_map)

# Add month and day-of-week
df_all['month'] = ts_month
df_all['dow'] = ts.dt.dayofweek

# Undo the concatenation by position: the first num_train rows are the
# training partition, the remainder is the test partition.
df_test = df_all[num_train:]
df_train = df_all[:num_train]

# Re-attach price_doc (looked up by id) so rows can be filtered on price.
df_train = pd.merge_ordered(df_train, train_labels, on='id', how='left')

# Shape of the training frame before any rows are removed.
startingRows, startingColumns = df_train.shape

# Keep only rows whose price lies strictly between the 3rd and 97th
# percentiles; both cut-offs are computed on the untrimmed frame.
bottom = df_train["price_doc"].quantile(0.03)
top = df_train["price_doc"].quantile(0.97)
price = df_train["price_doc"]
df_train = df_train[(price < top) & (price > bottom)]

# ------
# Creating new model

# Booster settings for this round.
xgb_params = {
    'eta': 0.05,
    'max_depth': 5,
    'subsample': 1.0,
    'colsample_bytree': 0.7,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1
}

# Feature matrix: every column except the row id, the raw timestamp and the
# target. The target vector is the price_doc column.
train_X = df_train.drop(["id", "timestamp", "price_doc"], axis=1)
train_y = df_train["price_doc"].values

# Wrap features and labels in xgboost's DMatrix container, keeping the
# DataFrame column names as feature names.
XGB_Train = xgb.DMatrix(
    train_X,
    label=train_y,
    feature_names=train_X.columns.values,
)

# Fit the booster for a fixed 100 rounds.
model = xgb.train(xgb_params, XGB_Train, num_boost_round=100)

# Bar chart of the 50 highest-ranked features of the fitted model.
fig, ax = plt.subplots(figsize=(12, 18))
xgb.plot_importance(model, ax=ax, height=0.8, max_num_features=50)
plt.show()


In [None]:
# Run test data through model to generate a second round of predictions
test = df_test.drop(["id", "timestamp"], axis=1)
XGB_Test = xgb.DMatrix(test)

# Generate predictions with the model fitted in the previous cell
ypred = model.predict(XGB_Test)

# Merge predictions with ID's to create submission
idCol = df_test["id"].values
if len(ypred) != len(idCol):
    # Stop here: without this, the to_csv call below would write whatever
    # df_sub an earlier cell left behind (or fail with a NameError) instead of
    # this round's predictions.
    print("Oh snap!")
    raise ValueError(
        "prediction count ({}) does not match id count ({})".format(len(ypred), len(idCol))
    )

# Two-column table (id, price_doc). The cast to int truncates the predicted
# prices to whole numbers.
sub = np.column_stack((idCol,ypred))
df_sub = pd.DataFrame(data=sub,columns=["id","price_doc"])
df_sub = df_sub.astype(int)
print("Final output:")
print(df_sub.head())

# Generating second submission and submitting for score
df_sub.to_csv('submission2.csv', index=False)

# Score = 0.34993
# This is actually worse... Probably need to clean up the data a bit.