# Imports

In [1]:
from __future__ import print_function 
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from tqdm import trange, tqdm
import pickle

%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (5, 5)

%load_ext autoreload
%autoreload 2

from hackathon.utils.utils import *
from hackathon.utils.draw_utils import *
import pandas as pd

# Fetching data

In [2]:
# Parse both CSV files with the project's parse_data helper (star-imported
# from hackathon.utils.utils). The two loads are independent of each other.
train_data = parse_data("../data/train_data.csv")
test_data = parse_data("../data/test_data.csv")

# Splitting train_data into train / validation dataset

## You should test how your model performs on data that was not seen during training
### This is the "first rule of proper machine learning"
### The purpose of validation data is to choose the best model and parameters

In [3]:
# Chronological hold-out: rows whose index falls before the cutoff are used
# for training, every remaining row is kept aside for validation.
train_dates = train_data.index < '2013-01-01 00:00:00'
train_df, valid_df = train_data[train_dates], train_data[~train_dates]

# Cell result: the (rows, columns) shape of each split.
(train_df.shape, valid_df.shape)

((37928, 59), (17447, 59))

# Extract X and y for train/valid/test set

In [4]:
# Feature columns: every column whose name contains neither "NPWD" (those are
# the targets) nor 'ut_ms'.
# List comprehensions are used instead of filter() so that x_columns and
# y_columns are real lists on both Python 2 and Python 3. On Python 3,
# filter() returns a one-shot iterator: it would print as "<filter object ...>"
# and be exhausted after the first DataFrame selection below.
x_columns = [col for col in train_df.columns if "NPWD" not in col and 'ut_ms' not in col]
print(x_columns)
print()
# Target columns: every column whose name contains "NPWD".
y_columns = [col for col in train_df.columns if "NPWD" in col]
print(y_columns)

# create train/valid datasets
X_train = train_df[x_columns]
y_train = train_df[y_columns]

X_valid = valid_df[x_columns]
y_valid = valid_df[y_columns]

# The same column lists are applied to the test frame.
X_test = test_data[x_columns]
y_test = test_data[y_columns]

['sunmars_km', 'earthmars_km', 'sunmarsearthangle_deg', 'solarconstantmars', 'eclipseduration_min', 'occultationduration_min', 'sa', 'sx', 'sy', 'sz', 'dmop_count_24h_AAAA', 'dmop_count_24h_AACF', 'dmop_count_24h_ADMC', 'dmop_count_24h_AHHH', 'dmop_count_24h_AMMM', 'dmop_count_24h_AOOO', 'dmop_count_24h_APSF', 'dmop_count_24h_APWF', 'dmop_count_24h_ASEQ', 'dmop_count_24h_ASSS', 'dmop_count_24h_ASXX', 'dmop_count_24h_ATMB', 'dmop_count_24h_ATTT', 'dmop_count_24h_AVVV', 'dmop_count_24h_AXXX', 'dmop_count_24h_sum']

['NPWD2372', 'NPWD2401', 'NPWD2402', 'NPWD2451', 'NPWD2471', 'NPWD2472', 'NPWD2481', 'NPWD2482', 'NPWD2491', 'NPWD2501', 'NPWD2531', 'NPWD2532', 'NPWD2551', 'NPWD2552', 'NPWD2561', 'NPWD2562', 'NPWD2691', 'NPWD2692', 'NPWD2721', 'NPWD2722', 'NPWD2742', 'NPWD2771', 'NPWD2791', 'NPWD2792', 'NPWD2801', 'NPWD2802', 'NPWD2821', 'NPWD2851', 'NPWD2852', 'NPWD2871', 'NPWD2872', 'NPWD2881', 'NPWD2882']


# Fit a simple linear model (linear regression) on train data

In [6]:
# NOTE(review): this import would normally live in the imports cell at the top
# of the notebook so all dependencies are visible in one place.
from sklearn.linear_model import LinearRegression
# Baseline model: LinearRegression with default settings, fit on the training
# split only (the validation split stays unseen).
linear_model = LinearRegression()
# Left as the last expression so the fitted estimator's repr is displayed.
linear_model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

# Check your score on validation data

## We are checking how the model performs on validation data

In [8]:
# RMSE is the project's scoring helper (already reachable through the star
# import at the top; imported explicitly here to show where it comes from).
from hackathon.utils.utils import RMSE
# Predictions are passed first, true targets second; the score is shown as the
# cell result.
RMSE(linear_model.predict(X_valid), y_valid)

0.11585583530285432

# Fit a maybe-better-model (a regularized linear model - more resistant to overfitting)

In [9]:
from sklearn.linear_model import ElasticNet

In [10]:
# ElasticNet with default hyperparameters (the repr below shows alpha=1.0 and
# l1_ratio=0.5); nothing is tuned here.
maybe_better_linear_model = ElasticNet()
# Fit on the same training split as the plain linear model for a fair comparison.
maybe_better_linear_model.fit(X_train, y_train)

ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

# Evaluate it

In [11]:
# Score the ElasticNet on the same validation split and with the same RMSE
# helper that was used for the plain linear model.
RMSE(maybe_better_linear_model.predict(X_valid), y_valid)

0.11685221210036432

### It was worse (validation RMSE 0.1169 vs 0.1159 for plain linear regression; lower is better) :<< so sad

# Train the better performing model on (train + valid) data

In [12]:
# Stack the training and validation splits back together so the final model
# is fit on every available training row.
X_train_valid = pd.concat([X_train, X_valid])
y_train_valid = pd.concat([y_train, y_valid])

# Refit a fresh LinearRegression (the better scorer on validation) on the
# combined data; the fit call stays last so the estimator repr is displayed.
best_model = LinearRegression()
best_model.fit(X_train_valid, y_train_valid)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

# Create a submission file which you will upload later

In [13]:
def create_submission_df(model, X_test_features, y_columns):
    """Build a submission DataFrame from a fitted model's test predictions.

    Args:
        model: fitted estimator exposing ``predict``.
        X_test_features: DataFrame of test features; its index is reused for
            the predictions before being converted.
        y_columns: column labels to give the predicted targets.

    Returns:
        DataFrame of predictions with columns ``y_columns`` and an index
        converted by the project's ``to_utms`` helper (presumably to
        'ut_ms' timestamps -- confirm against hackathon.utils).
    """
    predictions = model.predict(X_test_features)
    submission = pd.DataFrame(
        data=predictions,
        index=X_test_features.index,
        columns=y_columns,
    )
    # Convert the index only after the frame is built, as the submission
    # format expects the to_utms representation.
    submission.index = to_utms(submission.index)
    return submission


# Predict on the test features with the final model and write the result to
# the submissions folder as CSV.
submission_path = "../submissions/my_first_model_prediction.csv"
sub_df = create_submission_df(best_model, X_test[x_columns], y_columns)
sub_df.to_csv(submission_path)