# Install Time Prediction Baseline
Example notebook for reading and transforming install prediction data.

Version 0.1.1
(convert to python stand-alone with `jupyter nbconvert --to python baseline.ipynb`)

In [None]:
# ===============LICENSE_START=======================================================
# Apache-2.0
# ===================================================================================
# Copyright (C) 2019 AT&T Intellectual Property  All rights reserved.
# ===================================================================================
# This software file is distributed by AT&T
# under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# This file is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ===============LICENSE_END=========================================================
# Eric Zavesky, 03/22/19

import pandas as pd  # data read
import numpy as np
from sklearn import preprocessing  # data ETL
from sklearn.model_selection import train_test_split   # balanced partioning
import os,sys  # file checks
import pickle   # compressed results
import gzip  # compression 
import yaml   # configuration file

from sklearn.feature_extraction import text  # text processing
from sklearn import ensemble  # random forest
from sklearn import metrics  # final scoring
from sklearn.model_selection import cross_validate, GridSearchCV  # training


# Configuration Options

It's handy to include configuration options in a standard file that can be quickly modified and rerun if you're training something new.  Of course, you can always use command-line configurations as well, but a handy set of defaults in a human-readable file might be a bit easier when you're running things in notebooks.

Here, we're using a simple [YAML](https://camel.readthedocs.io/en/latest/yamlref.html) file for our options which is human-readable, allows comments, and is well supported by other languages.

To modify this program's operation, just open the file `config.yaml` in your editor of choice and rerun this script.

In [None]:
# Load the run configuration from a YAML file that sits next to the notebook.
config_path = 'config.yaml'
if not os.path.isfile(config_path):
    print("Sorry, can't find the configuration file {}, aborting.".format(config_path))
    sys.exit(-1)
# use a context manager so the file handle is closed as soon as parsing is done
with open(config_path) as config_file:
    config = yaml.safe_load(config_file)

# Data Exploration
First, let's load our data to see if we need to perform any transform operations.  We will load and parse into rows and columns using the [pandas read_csv function](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html) to return a standardized [pandas dataframe](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html). 

Of course, you could use whatever library or load function you're used to, but these dataframes have nice interoperability properties with other libraries for learning and manipulation.

In [None]:
# Abort early with a readable message when the training CSV is missing.
if not os.path.isfile(config["path"]["train_data"]):
    # exactly one placeholder: the previous two-placeholder message raised an
    # IndexError from str.format instead of printing the intended text
    print("Sorry, can't find the raw input file ({}), aborting.".format(config["path"]["train_data"]))
    sys.exit(-1)

df = {}
df_train = pd.read_csv(config["path"]["train_data"])
# use the `row_id` column as the sample index (and drop it from the columns)
df_train.set_index("row_id", drop=True, inplace=True)

# quick sanity check: overall shape and five randomly chosen rows
print("Dimensionality of Training: {}".format(df_train.shape))
print(df_train.sample(5))


# Data Preprocessing
Preprocessing for this data will likely convert some of our textual columns (`capbucket` and `capcat`) into numerical values.  Additionally, we can make sense of the `date` field with some other seasonal manipulations.  Finally, since we're using `row_id` simply as a sample index, let's update our dataframe. 


In [None]:
# Registry of fitted transformers/estimators, keyed by the column or role they serve.
# NOTE(review): OrdinalEncoder rejects categories it did not see during fit by
# default -- confirm the test data only contains known `capbucket` parts.
models = {
    "capbucket": preprocessing.OrdinalEncoder(),  # one ordinal code per bucket part
    "capcat": text.CountVectorizer(),  # bag-of-words counts over the category text
}


# our main preprocessing function
def preproc_data(models, df_test, df_train=None):
    """Convert raw install rows into an all-numeric feature frame.

    Args:
        models: dict holding the encoders under the keys "capbucket" and
            "capcat".  Both need `fit`/`transform`; the "capcat" entry must
            also expose its vocabulary (`get_feature_names_out`, or the older
            `get_feature_names`) and return something with `toarray()` from
            `transform`.
        df_test: frame to transform; must contain the columns `date`
            (formatted as YYYY-MM-DD), `capbucket` and `capcat`.  It is not
            modified; a transformed copy is returned.
        df_train: optional frame used to fit both encoders before the
            transform.  When None, the encoders are used as they are.

    Returns:
        Tuple `(models, features)` where `features` has the date expanded to
        `weekday`/`day`/`year`/`month`, the bucket parts as `loc_<i>` columns
        and one `job_<term>` column per vocabulary term, all cast to float.
    """
    print("Test dimensionality before processing {}".format(df_test.shape))

    if df_train is not None:
        # convert categorical/fixed text into numbers
        # split the bucket on "_" into at most four parts; `n` is passed by
        # keyword because recent pandas releases reject it positionally
        tmp_feat = df_train["capbucket"].str.split("_", n=3).values.tolist()
        models["capbucket"].fit(tmp_feat)
        models["capcat"].fit(df_train["capcat"])

    # work on a private copy so the caller's frame keeps its original columns
    df_test = df_test.copy()

    # convert the date into something more reasonable
    print("Preprocessing vectorized 'date' ...")
    df_time = pd.to_datetime(df_test["date"], format='%Y-%m-%d')
    df_test["weekday"] = df_time.dt.dayofweek   # grab the day of week (Monday=0)
    df_test["day"] = df_time.dt.day   # grab the day of the month
    df_test["year"] = df_time.dt.year   # grab the calendar year
    df_test["month"] = df_time.dt.month   # grab the month of the year
    del df_test["date"]
    print("... dimensionality after date processing {}".format(df_test.shape))

    print("Preprocessing categorical data ...")
    # handle text encoding with categorical
    tmp_feat = df_test["capbucket"].str.split("_", n=3).values.tolist()
    tmp_feat = models["capbucket"].transform(tmp_feat)
    col_tmp = ["loc_{}".format(x) for x in range(len(tmp_feat[0]))]
    tmp_encode = pd.DataFrame(tmp_feat, columns=col_tmp, index=df_test.index)
    df_test = pd.concat([df_test, tmp_encode], axis=1, sort=False)
    del df_test["capbucket"]
    print("... dimensionality after location processing {}".format(df_test.shape))

    # job encoding by term; prefer the current scikit-learn accessor and fall
    # back to the legacy one so older installations keep working
    vectorizer = models["capcat"]
    if hasattr(vectorizer, "get_feature_names_out"):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()
    col_tmp = ["job_{}".format(x) for x in terms]
    tmp_encode = pd.DataFrame(vectorizer.transform(df_test["capcat"]).toarray(),
                              columns=col_tmp, index=df_test.index)
    df_test = pd.concat([df_test, tmp_encode], axis=1, sort=False)
    del df_test["capcat"]
    print("... dimensionality after job type processing {}".format(df_test.shape))

    # return trained/used model and test data
    df_test = df_test.astype(float)
    return models, df_test

# Separate the regression target from the features before any preprocessing.
Y = df_train["dailyCountSum"].astype(float)
X = df_train.drop(columns=["dailyCountSum"])

# Fit the encoders and transform in one call.  Here the same rows are used for
# fitting and transforming so that ALL of the training data is used; for an
# independent hold-out you would instead fit on a separate training split.
models, X = preproc_data(models, X, X.copy())
# take a peek at a few transformed rows
print(X.sample(5))


## Normalization and Scaling
Let's take advantage of a [standard scaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html) to get things into shape: each feature is standardized to zero mean and unit variance (note that this does not bound the values to `[0,1]`).

After scaling, we quickly visualize the training samples and see that most of our features are reasonable, except those that have a flat/empty box.  Also, we can see below that fields like `day` and `month` are numerically reduced to a range that is better for most classifiers to handle.

In [None]:
# Fit the scaler on the training features and standardise them, but only the
# first time this cell runs, so a re-run does not rescale already-scaled data.
if "scaler" not in models:
    models["scaler"] = preprocessing.StandardScaler().fit(X)
    X = pd.DataFrame(models["scaler"].transform(X), columns=X.columns, index=X.index)
# box plot of every feature column to eyeball the value ranges
plot = X.boxplot(figsize=(12, 6), rot=90)

In [None]:
# Choose the regressor and its parameter grid from the configuration, then run
# a 3-fold grid search and keep the refitted best estimator.
model_name = config["training"]["model"]
if model_name not in ("rf", "gbm"):
    raise Exception("Unknown training model {}, aborting now".format(config["training"]["model"]))

if model_name == "rf":
    # random forest; the grid currently pins a single setting per parameter
    models["primary"] = ensemble.RandomForestRegressor(n_estimators=1000, max_depth=3)
    models["param"] = {"n_estimators": (1000,), "max_depth": (3,)}
else:
    # gradient boosting; the grid currently pins a single setting per parameter
    models["primary"] = ensemble.GradientBoostingRegressor(
        n_estimators=500, max_depth=6, learning_rate=0.3, max_features=3)
    models["param"] = {
        "n_estimators": (500,),
        "learning_rate": (0.3,),
        "max_features": (3,),
        "max_depth": (6,),
    }

scoring_names = config["training"]["scoring"]
clf = GridSearchCV(
    models["primary"],
    models["param"],
    cv=3,
    verbose=2,
    n_jobs=config["training"]["threads"],
    scoring=scoring_names,
    refit=scoring_names[0],  # the first configured metric picks the winner
    return_train_score=True,
)
print("Executing grid search ({})".format(clf))
scores = clf.fit(X, Y)
models["best"] = clf.best_estimator_
print(clf.cv_results_)


In [None]:
# Show the winning estimator, then its mean train/test score for the first
# configured scoring metric.
print(models["best"])
primary_metric = config["training"]["scoring"][0]
for split in ("train", "test"):
    result_key = "mean_{}_{}".format(split, primary_metric)
    print("{}: {}".format(result_key, clf.cv_results_[result_key]))


# Evaluating Test Data
The mean performance scores above are helpful, but they may not map exactly to performance on a test set -- don't be discouraged if they differ.

The last thing to do is evaluate on test data, which generally has these steps.
1. Load the test data
2. Preprocess the raw data to generate new features
3. Evaluate the preprocessed features for direct evaluation

In [None]:
# finally, apply it on test data
df_test = pd.read_csv(config["path"]["test_data"])
# set the index
df_test.set_index("row_id", drop=True, inplace=True)

# do processing + prediction (no training frame: reuse the already-fitted encoders)
models, df_test = preproc_data(models, df_test)
# normalize feature values with the scaler fitted on the TRAINING data;
# `transform` (not `fit_transform`) keeps the test features on the same scale
# the model was trained with instead of re-estimating it from the test set
df_test = pd.DataFrame(models["scaler"].transform(df_test), columns=df_test.columns, index=df_test.index)
# predict values
nd_predict = models["best"].predict(df_test)
print("Writing predictions to '{}'".format(config["path"]["test_predictions"]))
pd.DataFrame(nd_predict, columns=["dailyCountSum"], index=df_test.index).to_csv(
    config["path"]["test_predictions"], header=True,  index=True)

# optional scoring, only when a ground-truth labels file is present
if os.path.isfile(config["path"]["test_labels"]):
    print("=== Special admin review with TEST data === ")
    df_labels = pd.read_csv(config["path"]["test_labels"])
    # NOTE(review): predictions are attached positionally, so the labels file
    # is assumed to list rows in the same order as the test data -- confirm
    df_labels["predict"] = nd_predict
    print(df_labels)
    print("Test Score: {}".format(metrics.mean_squared_error(
        df_labels["dailyCountSum"], df_labels["predict"])))
    

# Time to Submit!
All done with evaluating the data and writing your predictions.  They should be saved in the `config["path"]["test_predictions"]` file (by default, it's defined as `data/ss-2019-iefs-install-test_predictions.csv`).  Just pick up that file and submit it through the assessment interface.

In [None]:

# example code to shuffle from manicured ground truth!
def gt_shuffle():
    """Shuffle the labelled test CSV and split it into features and labels.

    Reads the file at `config["path"]["test_data"]`, indexes it by `row_id`,
    shuffles the rows, writes the `dailyCountSum` column (as float) to
    `config["path"]["test_labels"]`, and overwrites the test data file with
    the remaining columns in the same shuffled order.
    """
    test_path = config["path"]["test_data"]
    frame = pd.read_csv(test_path)
    frame.set_index("row_id", drop=True, inplace=True)

    from sklearn import utils
    shuffled = utils.shuffle(frame)
    # pop removes the target column from the features while handing it back
    labels = shuffled.pop("dailyCountSum").astype(float)

    # persist labels first, then the label-free features over the input file
    labels.to_csv(config["path"]["test_labels"], header=True, index=True)
    shuffled.to_csv(test_path, header=True, index=True)
#gt_shuffle()
