In [None]:
## Libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt

In [None]:
# Load the raw tables. `timestamp` is parsed to datetime in all three frames
# so that the later merges on `timestamp` compare keys of the same dtype.
train = pd.read_csv("../input/train.csv", parse_dates=['timestamp'])
test = pd.read_csv("../input/test.csv", parse_dates=['timestamp'])
# Without parse_dates, macro's `timestamp` would stay a string column and could
# not be matched against the datetime keys of train/test.
macro = pd.read_csv('../input/macro.csv', parse_dates=['timestamp'])
# Regression target: natural log of `price_doc`.
y = np.log(train.price_doc)

In [None]:
# Training frame: attach the macro columns by timestamp, discard the
# identifier, target and join-key columns, then one-hot encode via get_dummies.
train = pd.get_dummies(
    train.merge(macro, how='left', on='timestamp')
         .drop(['id', 'price_doc', 'timestamp'], axis=1)
)

# Test frame: same merge and encoding; only the join key is dropped here,
# so `id` stays available in `test`.
test = pd.get_dummies(
    test.merge(macro, how='left', on='timestamp')
        .drop(['timestamp'], axis=1)
)

# Feature pre-processing: flag training columns whose share of missing values
# exceeds 20% and keep only the remaining ones.
percent_null = train.isnull().mean(axis=0) > 0.20
train = train.loc[:, ~percent_null]


In [None]:
#splitting
# Hold out 20% of the rows for validation. The fixed random_state makes the
# split, and therefore the errors reported below, reproducible across
# kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    train, y, test_size=0.2, random_state=42
)
print("X_train : " + str(X_train.shape))
print("X_test : " + str(X_test.shape))
print("y_train : " + str(y_train.shape))
print("y_test : " + str(y_test.shape))

In [None]:
#Measuring functions
def rmsle_exp(y_true_log, y_pred_log):
    """Root mean squared logarithmic error for values given in log space.

    Both inputs are exponentiated back to their original scale and then
    compared as ``log(1 + value)``.

    Parameters
    ----------
    y_true_log : array-like
        Natural log of the true values.
    y_pred_log : array-like
        Natural log of the predicted values.

    Returns
    -------
    float
        ``sqrt(mean((log1p(y_true) - log1p(y_pred)) ** 2))``.
    """
    y_true = np.exp(y_true_log)
    y_pred = np.exp(y_pred_log)
    # log1p stays accurate for values near zero, where `x + 1` would round
    # the contribution of x away before the logarithm is taken.
    return np.sqrt(np.mean(np.square(np.log1p(y_true) - np.log1p(y_pred))))

def score_model(model, pipe):
    """Score a fitted model on the notebook's train and hold-out splits.

    Reads the module-level ``X_train``/``y_train`` and ``X_test``/``y_test``.
    Each feature frame is passed through ``pipe.transform`` and then
    ``model.predict``; the result is compared with the matching target
    using ``rmsle_exp``.

    Parameters
    ----------
    model : object exposing ``predict``
    pipe : object exposing ``transform``

    Returns
    -------
    tuple
        ``(train_error, test_error)``.
    """
    splits = ((X_train, y_train), (X_test, y_test))
    return tuple(
        rmsle_exp(target, model.predict(pipe.transform(features)))
        for features, target in splits
    )

In [None]:
# Preprocessing pipeline: Imputer (default settings) followed by
# StandardScaler, fitted on the training split only.
preprocessing_steps = [Imputer(), StandardScaler()]
pipe = make_pipeline(*preprocessing_steps)
pipe.fit(X_train)

In [None]:
# Fit an ordinary least-squares model on the preprocessed training features,
# then report the error on both splits.
X_train_prepared = pipe.transform(X_train)
lr = LinearRegression(fit_intercept=True).fit(X_train_prepared, y_train)
train_error, test_error = score_model(lr, pipe)
print("Train error: {:.4f}, Test error: {:.4f}".format(train_error, test_error))

In [None]:
#submission
# Give the test frame exactly the training feature columns, in the same order.
# Columns absent from `test` are presumably one-hot categories that never occur
# in the test rows (TODO confirm), so they are filled with 0 rather than NaN:
# NaN would be imputed by `pipe` into a fractional category indicator.
# Missing values already present inside `test` are untouched and still reach
# the imputer as NaN.
df_test = test.reindex(columns=train.columns, fill_value=0)

# The model was fitted on log(price_doc), so exponentiate its output to get
# predictions on the original scale.
predictions = np.exp(lr.predict(pipe.transform(df_test)))

# Assemble the submission table: the test ids next to their predicted values.
predictions_df = pd.DataFrame(
    {'id': test['id'], 'price_doc': predictions},
    columns=['id', 'price_doc'],
)
predictions_df.head()

In [None]:
# (intentionally empty cell; the bare name `null` is undefined in Python and
# would raise NameError when the notebook is run top to bottom)

In [None]:
# Write the submission table to disk, leaving out the DataFrame index column.
predictions_df.to_csv(path_or_buf='predictions.csv', index=False)