# Python scikit-learn Machine Learning Workflow 

'''
Python scikit-learn Machine Learning Workflow

In this 1.5 h session you will learn how to build an efficient, reusable workflow that starts with a pandas DataFrame and ends with a trained scikit-learn linear regression model.

Pandas
-	Overview of how to use pandas to read, describe, clean and prepare a dataset for machine learning
scikit-learn
-	Handle missing data
-	Create training and testing datasets
-	Impute missing values
-	Perform linear regression for a dependent variable of type float (target) and independent variables of type float or integer (features)
-	Create a workflow pipeline
-	Perform n-fold cross validation on the entire pipeline
-	Tune the hyperparameters of the model using grid search
-	Perform feature selection
'''

In [1]:
# Import libraries
import pandas as pd
from datetime import datetime
from sklearn.model_selection import train_test_split
import datasense as ds

In [2]:
# Record the wall-clock start so the elapsed run time can be reported later
start_time = datetime.now()

In [3]:
# Global parameters: pandas display limits, input file, and column roles
pd.set_option('display.max_rows', None)     # None = no row limit
pd.set_option('display.max_columns', None)  # None = no column limit
filename = 'lunch_and_learn.csv'
target = 'Y'
# Feature columns are named X1 through X13
features = [f'X{number}' for number in range(1, 14)]

In [4]:
# Read data file into a pandas DataFrame
# (the path comes from `filename`; no extra parsing options are passed)
data = pd.read_csv(filename)

In [5]:
# ds.dataframe_info(data, filename)

In [6]:
# Size of the raw data as (rows, columns)
data.shape

(5000, 14)

In [7]:
# Column labels as read from the file
data.columns

Index(['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10', 'X11',
       'X12', 'X13', 'Y'],
      dtype='object')

In [8]:
# Describe the feature columns
# for column in features:
#     print(column)
#     result = ds.nonparametric_summary(data[column])
#     print(result, '\n')

In [9]:
# Per-column limits as (column, low, high) tuples. The masking loop that
# follows replaces values at or outside these limits (<= low or >= high)
# with missing values, so only readings strictly between the limits survive.
mask_values = [
    ('X1', -20, 20),
    ('X2', -25, 25),
    ('X3', -5, 5),
    ('X4', -10, 10),
    ('X5', -3, 3),
    ('X6', -5, 5),
    ('X7', -13, 13),
    ('X8', -9, 15),
    ('X9', -17, 15),
    ('X10', -16, 15),
    ('X11', -16, 17),
    ('X12', -16, 17),
    ('X13', -20, 23)
]

In [10]:
# Keep only readings strictly inside each column's (low, high) limits;
# everything else (including values equal to a limit) becomes NaN.
for name, lower, upper in mask_values:
    series = data[name]
    within_limits = (series > lower) & (series < upper)
    data[name] = series.where(within_limits)

In [11]:
# Describe the feature columns
# for column in features:
#     print(column)
#     result = ds.nonparametric_summary(data[column])
#     print(result, '\n')

In [12]:
# Separate the feature matrix from the target, then hold out 33 % of the
# rows for testing. The fixed random_state makes the split reproducible.
X, y = data[features], data[target]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [13]:
# Full feature matrix: (rows, number of feature columns)
X.shape

(5000, 13)

In [14]:
# Full target vector: one value per row
y.shape

(5000,)

In [15]:
# Training features: the rows left after holding out the test share
X_train.shape

(3350, 13)

In [16]:
# Test features: the held-out share of rows (test_size=0.33)
X_test.shape

(1650, 13)

In [17]:
# Training target: row count should match X_train
y_train.shape

(3350,)

In [18]:
# Test target: row count should match X_test
y_test.shape

(1650,)

In [19]:
# Elapsed wall-clock time, in seconds, since start_time was recorded
end_time = datetime.now()
(end_time - start_time).total_seconds()

0.304135