# train.py Validation

## Objective

The purpose of this notebook is to validate the correct execution of the code included in the train.py file.

## Import Libraries

In [1]:
#----- These libraries were provided in the original code
#----- They were sorted by type

#- General Libraries
import argparse
import os
import joblib
import numpy as np
import pandas as pd

#- sklearn libraries
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

#- Azure ML libraries
from azureml.core.run import Run
from azureml.data.dataset_factory import TabularDatasetFactory

#- This library was included to load the dataset
from azureml.core import Dataset

## Code

In [None]:
#----- The original code of this section was relocated to the last part of the script,
#----- since the clean_data function was called before it was defined

### Clean data

In [2]:
#- This function was provided in the original code
#- It performs data cleansing operations such as:
#--- Drop null values
#--- Transformation of categorical values to numerical values (e.g. month name to month number, no/yes to 0/1)
#--- One hot encoding for features with many values (e.g. job, education)

def clean_data(data):
    """Convert the raw bank-marketing dataset into numeric features and a 0/1 label.

    Steps, in order: drop rows containing nulls, one-hot encode ``job``,
    binarize ``marital``/``default``/``housing``/``loan``, one-hot encode
    ``contact`` and ``education``, map ``month`` and ``day_of_week`` to
    numbers, binarize ``poutcome``, and finally split off the ``y`` column.

    Args:
        data: object exposing ``to_pandas_dataframe()``.

    Returns:
        tuple: ``(x_df, y_df)`` where ``x_df`` is the feature DataFrame
        (without the ``y`` column) and ``y_df`` is the label Series in which
        "yes" becomes 1 and any other value becomes 0.
    """
    # Lookup tables: month / weekday abbreviation -> ordinal number
    month_to_number = {
        "jan": 1, "feb": 2, "mar": 3, "apr": 4, "may": 5, "jun": 6,
        "jul": 7, "aug": 8, "sep": 9, "oct": 10, "nov": 11, "dec": 12,
    }
    weekday_to_number = {
        "mon": 1, "tue": 2, "wed": 3, "thu": 4, "fri": 5, "sat": 6, "sun": 7,
    }

    def as_flag(positive_value):
        # Build a mapper that yields 1 for `positive_value` and 0 for anything else
        return lambda value: 1 if value == positive_value else 0

    def one_hot(frame, column):
        # Replace `column` by its dummy columns, which are appended at the end of the frame
        dummies = pd.get_dummies(frame[column], prefix=column)
        return frame.drop(columns=column).join(dummies)

    # Rows with any missing value are discarded before encoding
    features = data.to_pandas_dataframe().dropna()

    features = one_hot(features, "job")

    binary_columns = (
        ("marital", "married"),
        ("default", "yes"),
        ("housing", "yes"),
        ("loan", "yes"),
    )
    for column, positive_value in binary_columns:
        features[column] = features[column].apply(as_flag(positive_value))

    features = one_hot(features, "contact")
    features = one_hot(features, "education")

    features["month"] = features["month"].map(month_to_number)
    features["day_of_week"] = features["day_of_week"].map(weekday_to_number)
    features["poutcome"] = features["poutcome"].apply(as_flag("success"))

    # The label is removed from the features and returned separately
    labels = features.pop("y").apply(as_flag("yes"))

    return features, labels

In [None]:
#- This function was provided in the original code
#- It will be called in the udacity-project notebook to test Hyperparameter tuning and Auto ML approaches

def main():
    """Train a logistic-regression model from command-line hyperparameters and log the results.

    Command-line arguments:
        --C (float, default 1.0): inverse of regularization strength.
        --max_iter (int, default 100): maximum number of iterations to converge.

    The function reads these module-level names, which must exist before it is called:
    ``run`` (its ``log`` method is used) and ``x_train``, ``x_test``, ``y_train``, ``y_test``.
    It logs the two hyperparameters and the score on the test split under the keys
    "Regularization Strength:", "Max iterations:" and "Accuracy". Nothing is returned.
    """
    # NOTE(review): `run` is not defined before this cell in the notebook —
    # presumably it comes from Run.get_context(); confirm before calling main().

    # Add arguments to script
    parser = argparse.ArgumentParser()

    parser.add_argument('--C', type=float, default=1.0, help="Inverse of regularization strength. Smaller values cause stronger regularization")
    parser.add_argument('--max_iter', type=int, default=100, help="Maximum number of iterations to converge")

    args = parser.parse_args()

    # np.float / np.int were plain aliases of the builtins; they were deprecated in
    # NumPy 1.20 and removed in 1.24, so the builtins are used directly.
    run.log("Regularization Strength:", float(args.C))
    run.log("Max iterations:", int(args.max_iter))

    model = LogisticRegression(C=args.C, max_iter=args.max_iter).fit(x_train, y_train)

    # score() is evaluated on the held-out test split
    accuracy = model.score(x_test, y_test)
    run.log("Accuracy", float(accuracy))


### Load dataset

In [3]:
#- Define path to data
#- URL of the bankmarketing_train.csv file that is loaded into a dataset below
web_path ='https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv'

In [4]:
#- Create tabular dataset
#- Built from the delimited (CSV) file located at web_path
ds = Dataset.Tabular.from_delimited_files(path=web_path)

### Clean dataset

In [5]:
#- Call function to clean dataset
#- It performs categorical to numerical transformations and one-hot encoding
#- Returns the features (x) and the 0/1 label (y) separately
x, y = clean_data(ds)

In [6]:
#- Sanity check: preview the first rows of the cleaned features
x.head()

Unnamed: 0,age,marital,default,housing,loan,month,day_of_week,duration,campaign,pdays,...,contact_cellular,contact_telephone,education_basic.4y,education_basic.6y,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree,education_unknown
0,57,1,0,0,1,5,1,371,1,999,...,1,0,0,0,0,1,0,0,0,0
1,55,1,0,1,0,5,4,285,2,999,...,0,1,0,0,0,0,0,0,0,1
2,33,1,0,0,0,5,5,52,1,999,...,1,0,0,0,1,0,0,0,0,0
3,36,1,0,0,0,6,5,355,4,999,...,0,1,0,0,0,1,0,0,0,0
4,27,1,0,1,0,7,5,189,2,999,...,1,0,0,0,0,1,0,0,0,0


In [7]:
#- Sanity check: preview the first values of the 0/1 label
y.head()

0    0
1    0
2    0
3    0
4    0
Name: y, dtype: int64

### Analysis of data features returned by the clean_data function

In [8]:
#- Get information about the features dataset
#- (column names, non-null counts and dtypes)
x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32950 entries, 0 to 32949
Data columns (total 39 columns):
age                              32950 non-null int64
marital                          32950 non-null int64
default                          32950 non-null int64
housing                          32950 non-null int64
loan                             32950 non-null int64
month                            32950 non-null int64
day_of_week                      32950 non-null int64
duration                         32950 non-null int64
campaign                         32950 non-null int64
pdays                            32950 non-null int64
previous                         32950 non-null int64
poutcome                         32950 non-null int64
emp.var.rate                     32950 non-null float64
cons.price.idx                   32950 non-null float64
cons.conf.idx                    32950 non-null float64
euribor3m                        32950 non-null float64
nr.employed        

In [9]:
#- Get summary statistics of the features dataset
#- (count, mean, std, min, quartiles and max per column)
x.describe()

Unnamed: 0,age,marital,default,housing,loan,month,day_of_week,duration,campaign,pdays,...,contact_cellular,contact_telephone,education_basic.4y,education_basic.6y,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree,education_unknown
count,32950.0,32950.0,32950.0,32950.0,32950.0,32950.0,32950.0,32950.0,32950.0,32950.0,...,32950.0,32950.0,32950.0,32950.0,32950.0,32950.0,32950.0,32950.0,32950.0,32950.0
mean,40.040212,0.605948,9.1e-05,0.522974,0.151806,6.605281,2.980789,257.335205,2.56173,962.17478,...,0.63569,0.36431,0.101153,0.056055,0.147496,0.229226,0.000455,0.128346,0.294901,0.042367
std,10.432313,0.488653,0.009542,0.499479,0.358838,2.041099,1.41158,257.3317,2.763646,187.646785,...,0.481243,0.481243,0.301536,0.230031,0.354605,0.420341,0.021332,0.33448,0.456005,0.201429
min,17.0,0.0,0.0,0.0,0.0,3.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,32.0,0.0,0.0,0.0,0.0,5.0,2.0,102.0,1.0,999.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,38.0,1.0,0.0,1.0,0.0,6.0,3.0,179.0,2.0,999.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,47.0,1.0,0.0,1.0,0.0,8.0,4.0,318.0,3.0,999.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
max,98.0,1.0,1.0,1.0,1.0,12.0,5.0,4918.0,56.0,999.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Analysis of data cleansing performed on the dataset

| Feature        | Change |
| :------------- | :---   |
| age            | No change |
|  job           | One-hot encoding (columns with job_ prefix) |
| marital        | Changed to 0/1 |
| education      | One-hot encoding (columns with education_ prefix) |
| default        | Changed to 0/1 |
| housing        | Changed to 0/1 |
| loan           | Changed to 0/1 |
| contact        | One-hot encoding (columns with contact_ prefix) |
| month          | Changed to numerical values (1 to 12) |
| day_of_week    | Changed to numerical values (1 to 7) |
| duration       | No change |
| campaign       | No change |
| pdays          | No change |
| previous       | No change |
| poutcome       | Changed to 0/1 |
| emp.var.rate   | No change |
| cons.price.idx | No change |
| cons.conf.idx  | No change |
| euribor3m      | No change |
| nr.employed    | No change |
| y              | Changed to 0/1 |

### Split train and test datasets

In [10]:
#- Split dataset into train and test sets
#- Use a 70-30 proportion, since it is a commonly recommended value in the field
#- Set random_state to 0, to ensure that the same random split is reproduced between runs
#- NOTE(review): the split is not stratified on y — confirm this is acceptable for the label balance
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=0)

In [11]:
#- Sanity check: preview the first rows of the training features
x_train.head()

Unnamed: 0,age,marital,default,housing,loan,month,day_of_week,duration,campaign,pdays,...,contact_cellular,contact_telephone,education_basic.4y,education_basic.6y,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree,education_unknown
6986,31,1,0,0,0,8,2,551,2,999,...,1,0,0,0,0,0,0,0,1,0
664,30,1,0,0,0,6,2,215,10,999,...,0,1,0,1,0,0,0,0,0,0
9852,29,0,0,1,0,5,3,83,2,999,...,0,1,0,0,1,0,0,0,0,0
22102,27,1,0,1,1,7,1,166,3,999,...,1,0,0,0,1,0,0,0,0,0
13963,39,1,0,0,0,6,3,305,4,999,...,0,1,1,0,0,0,0,0,0,0


In [12]:
#- Sanity check: the training labels carry the same index values as x_train
y_train.head()

6986     0
664      0
9852     0
22102    0
13963    0
Name: y, dtype: int64

In [13]:
#- Sanity check: preview the first rows of the test features
x_test.head()

Unnamed: 0,age,marital,default,housing,loan,month,day_of_week,duration,campaign,pdays,...,contact_cellular,contact_telephone,education_basic.4y,education_basic.6y,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree,education_unknown
5379,25,1,0,0,0,6,3,76,1,999,...,0,1,0,0,1,0,0,0,0,0
13301,50,1,0,0,0,8,4,23,14,999,...,1,0,0,0,1,0,0,0,0,0
15089,36,1,0,0,0,5,5,363,8,999,...,0,1,0,0,1,0,0,0,0,0
8998,32,1,0,1,0,8,2,66,7,999,...,0,1,0,0,0,0,0,0,1,0
18820,36,0,0,0,1,6,4,385,1,999,...,0,1,0,0,0,0,0,1,0,0


In [14]:
#- Sanity check: the test labels carry the same index values as x_test
y_test.head()

5379     0
13301    0
15089    0
8998     0
18820    0
Name: y, dtype: int64

### Train model

The model used is Logistic Regression provided by sklearn. 
Logistic regression is a linear model for classification. The probabilities describing the possible outcomes of a single trial are modeled using a logistic function.  

The parameters used for the training process are:
* C: Inverse of regularization strength. Smaller values specify stronger regularization
  * Regularization tries to reduce or penalize the complexity of the model
  * A strong regularization means that a higher bias is induced in the model, so it will tend to generalize and may underfit
  * A weak regularization means that a lower bias is induced in the model, so it will tend to accommodate the variations in the feature space and overfit
* max_iter: maximum number of iterations taken for the solvers to converge

In [15]:
#- Use sklearn Logistic Regression
#- max_iter was set to 5000, since lower values produced a ConvergenceWarning
model = LogisticRegression(C=1.0, max_iter=5000).fit(x_train, y_train)

#- Get accuracy on the test set
accuracy = model.score(x_test, y_test)
accuracy

0.9163378856853819

### Test different values of C

In [16]:
#- Retrain with C=0.01 (stronger regularization than the C=1.0 baseline) and get test accuracy
model = LogisticRegression(C=0.01, max_iter=5000).fit(x_train, y_train)
accuracy = model.score(x_test, y_test)
accuracy

0.9151239251390997

In [17]:
#- Retrain with C=0.1 (stronger regularization than the C=1.0 baseline) and get test accuracy
model = LogisticRegression(C=0.1, max_iter=5000).fit(x_train, y_train)
accuracy = model.score(x_test, y_test)
accuracy

0.9155285786545271

In [18]:
#- Retrain with C=10 (weaker regularization than the C=1.0 baseline) and get test accuracy
model = LogisticRegression(C=10, max_iter=5000).fit(x_train, y_train)
accuracy = model.score(x_test, y_test)
accuracy

0.9167425392008093

In [19]:
#- Retrain with C=100.0 (weaker regularization than the C=1.0 baseline) and get test accuracy
model = LogisticRegression(C=100.0, max_iter=5000).fit(x_train, y_train)
accuracy = model.score(x_test, y_test)
accuracy

0.9161355589276682

### Test different values of max iterations

In [20]:
#- Keep the baseline C=1.0 and raise max_iter to 10000, then get test accuracy
model = LogisticRegression(C=1.0, max_iter=10000).fit(x_train, y_train)
accuracy = model.score(x_test, y_test)
accuracy

0.9163378856853819

In [21]:
#- Keep the baseline C=1.0 and raise max_iter to 15000, then get test accuracy
model = LogisticRegression(C=1.0, max_iter=15000).fit(x_train, y_train)
accuracy = model.score(x_test, y_test)
accuracy

0.9163378856853819

## AutoML code validation

This code was added because the original clean_data function removes the label column from the input dataset.  
AutoML requires an input dataset that contains both the feature columns and the label column.

In [22]:
#- Put the features and the label side by side in a single frame (label ends up as the last column)
x_ml = pd.concat([x, y], axis="columns")

In [23]:
#- Sanity check: the y column should now appear as the last column
x_ml.head()

Unnamed: 0,age,marital,default,housing,loan,month,day_of_week,duration,campaign,pdays,...,contact_telephone,education_basic.4y,education_basic.6y,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree,education_unknown,y
0,57,1,0,0,1,5,1,371,1,999,...,0,0,0,0,1,0,0,0,0,0
1,55,1,0,1,0,5,4,285,2,999,...,1,0,0,0,0,0,0,0,1,0
2,33,1,0,0,0,5,5,52,1,999,...,0,0,0,1,0,0,0,0,0,0
3,36,1,0,0,0,6,5,355,4,999,...,1,0,0,0,1,0,0,0,0,0
4,27,1,0,1,0,7,5,189,2,999,...,0,0,0,0,1,0,0,0,0,0


In [24]:
#- Get information about the combined features + label dataset
x_ml.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32950 entries, 0 to 32949
Data columns (total 40 columns):
age                              32950 non-null int64
marital                          32950 non-null int64
default                          32950 non-null int64
housing                          32950 non-null int64
loan                             32950 non-null int64
month                            32950 non-null int64
day_of_week                      32950 non-null int64
duration                         32950 non-null int64
campaign                         32950 non-null int64
pdays                            32950 non-null int64
previous                         32950 non-null int64
poutcome                         32950 non-null int64
emp.var.rate                     32950 non-null float64
cons.price.idx                   32950 non-null float64
cons.conf.idx                    32950 non-null float64
euribor3m                        32950 non-null float64
nr.employed        