# Machine Learning Pipelines Notes 
- Darshil Desai

Only put in stuff that is new to you, since you are already familiar with model fitting and training- do not overlap existing knowledge! 


In [34]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
# from sklearn.feature_extraction import CountV 
from sklearn.feature_selection import f_classif, chi2, SelectKBest
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.pipeline import Pipeline

In [2]:
# Datasets
credit = pd.read_csv('credit.csv')
flows = pd.read_csv('lanl_flows.csv')

# Chapter 1

### 1. Label Encoding
- Similar to Pandas "to_categorical" method, this feature in scikit-learn allows you to numerically represent your categorical variables!


In [3]:
non_numeric_columns = ['checking_status', 'credit_history', 'purpose', 'savings_status', 'employment', 'personal_status',
                      'other_parties', 'property_magnitude', 'other_payment_plans','housing', 'job', 'own_telephone','foreign_worker']

# Create a label encoder for each column. Encode the values
for column in non_numeric_columns:
    le = LabelEncoder()
    credit[column] = le.fit_transform(credit[column])
    
credit.head(1)    

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,1,6,1,4,1169,4,3,4,3,2,...,2,67,1,1,2,3,1,1,1,good


### 2. Param Grid

- This is an important concept that allows you to iterate over the HYPER-PARAMETERS of the model you choose and returns the one
with the best accuracy

In [4]:
try:
    y = credit['class']
    X = credit.drop('class', axis=1)    
except:
    print ('class col dropped')


In [5]:
# Split the data into train and test, with 20% as test
X_train, X_test, y_train, y_test = train_test_split(
  X, y, test_size=0.2, random_state=1)

# Set a range for n_estimators from 10 to 40 in steps of 10
param_grid = {'n_estimators': list(range(10, 50,10))}  # the hyper parameter here = n_estimators

# Not you define a model and simply plug in the hyper param values to iterate over
grid = GridSearchCV(RandomForestClassifier(), param_grid, cv=3)

grid.fit(X, y)
grid.best_params_  # the one that yields the best accuracy. 


# Question: does grid search work for contonous predictor variables? 

{'n_estimators': 40}

### 3. Categorical Encodings
This represents one-hot encoded categorical features! - Know this so will skip over explaining

In [6]:
# Create numeric encoding for credit_history
credit_history_num = LabelEncoder().fit_transform(
  credit['credit_history'])   # here we first transform the cats into numerical representation. Although not neccessary

# Create a new feature matrix including the numeric encoding
X_num = pd.concat([X, pd.Series(credit_history_num)], 1)

# Create new feature matrix with dummies for credit_history
X_hot = pd.concat(
  [X, pd.get_dummies(credit['credit_history'])],1)

# Compare the number of features of the resulting DataFrames
X_hot.shape[1] > X_num.shape[1]

True

## 4. f_classif - Statistical method for feature selection
Basically this helps us pick features that are continuous! Similar to how we pick categorical variables using chi-square tests

Note: 
"IF the features are quantitative, compute the ANOVA F-value between each feature and the target vector....The F-value scores examine **IF**, when we group the numerical feature by the target vector, the means for each group are significantly different."  - [chris_albon](https://chrisalbon.com/machine_learning/feature_selection/anova_f-value_for_feature_selection/)

In [7]:
# Function computing absolute difference from column mean
def abs_diff(x):
    return np.abs(x- np.mean(x))

# Apply it to the credit amount and store to new column
credit['credit_amount_diff'] = abs_diff(credit.credit_amount)

# Score old and new versions of this feature with f_classif()
scores = f_classif( credit[['credit_amount', 'credit_amount_diff']], credit['class'])[0]

# Inspect the scores and drop the lowest-scoring feature
credit_new = credit.drop(['credit_amount'], 1)

### ------------------ -------------------------- ------------------------ --------------------- ---------------------

# Chapter 2

### 4. Advanced Data Wrangling

- Take a look at some smart ways to wrangle data so you can draw inspiration from it in your feature work

In [8]:
flows.head(3)

Unnamed: 0,time,duration,source_computer,source_port,destination_computer,destination_port,protocol,packet_count,byte_count
0,471692,0,C5808,N24128,C26871,N17023,6,1,60
1,471692,0,C5808,N2414,C26871,N19148,6,1,60
2,471692,0,C5808,N24156,C26871,N8001,6,1,60


In [9]:
# Some function to apply on a dataframe
def featurize(df):
    """
    Takes in a dataframe and returns a dictionary with relevant information
    """
    return {
    'unique_ports': len(set(df['destination_port'])),
    'average_packet': np.mean(df['packet_count']),
    'average_duration': np.mean(df['duration'])
    }

#Group by source computer, and apply the feature extractor
out = flows.groupby('source_computer')  # This is an iterator object where each object is a dataframe grouped by the 
                                        #....."sourced computer" variable
    
out    

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002F397917F60>

In [10]:
"""
# Using apply to literaly apply the "featurize" function on top on each dataframe
    - Note the total number of rows, that means the iterator object had that many dataframes as each object!
"""

out = flows.groupby('source_computer').apply(featurize)

In [11]:
out[1:3]

source_computer
C10026    {'unique_ports': 2, 'average_packet': 21.0, 'a...
C10047    {'unique_ports': 5, 'average_packet': 21.07692...
dtype: object

In [12]:
# Convert the iterator to a dataframe by calling list on it
X = pd.DataFrame(list(out), index= out.index)

In [13]:
X.head(2)

Unnamed: 0_level_0,average_duration,average_packet,unique_ports
source_computer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C10,5.0,222.0,4
C10026,39.0,21.0,2


In [14]:
bads = ['C9945', 'C9723', 'C977', 'C10']

# Check which sources in X.index are bad to create labels
y = [x in bads for x in X.index]

In [15]:
# Report the average accuracy of Adaboost over 3-fold CV
print(np.mean(cross_val_score(AdaBoostClassifier(), X, y)))



0.9966414564404514


### 4. Advanced Feature Engineering

- Section in focus covers noteworthy ways in which to data wrangle

In [16]:

#SECTION IN FOCUS------------------------------
# Create a feature counting unique protocols per source
protocols = flows.groupby('source_computer').apply(
  lambda df: len(set(df.protocol)) )

"""
1. First the groupby simply creates an iterator object where each object is a df
2. next you use apply and lambda to calculate the number of unique values in the "protocol" column in EACH object/df !

So now each row has source_computer or ID and the unique protocols which is then added onto the main feature set
"""
#------------------------------


# Convert this feature into a dataframe, naming the column
protocols_DF = pd.DataFrame(
  protocols, index=protocols.index, columns=['protocol'])

# Now concatenate this feature with the previous dataset, X
X_more = pd.concat([X, protocols_DF], axis=1)  # could also use merge...

# Refit the classifier and report its accuracy
print(np.mean(cross_val_score(
  AdaBoostClassifier(), X_more, y)))


0.9966414564404514




In [17]:
# Interesting implementation of the uniqly package! Figure out how to iterate over all columns in one passing

protocols1 = flows.groupby('source_computer').apply(
  lambda df: list(set(df.protocol)) )

### 5. Sample Weights 

The idea here is that we can weight or give importance to some labels more than others! 

- For example: : _"One of your cyber analysts informs you that many of the labels for the first 100 source computers in your training data might be wrong because of a database error. She hopes you can still use the data because most of the labels are still correct, but asks you to treat these 100 labels as "noisy". Thankfully you know how to do that, using weighted learning. "_

    So here essentially in our entire training dataset the first 100 labels are "iffy" but the rest of the labels from 101 to whatever are fine! So we can still use this model by applying weights to the first 100 ! 

In [18]:
# Lets say that the first 100 labels were labeled using a heuristic process or some unreliable process
y_train_noisy = y_train  # assuming right

# Fit a classifier to the training data
clf = RandomForestClassifier().fit(X_train, y_train_noisy)

# Report its accuracy on the test data
print(accuracy_score(y_test, clf.predict(X_test)))

# Section in focus----------------------------------
    # Assign half the weight to the first 100 noisy examples
weights = [0.5]*100 + [1.0]*(len(y_train)-100)

"""
Here we set the weights of the first 100 to 0.5 and the rest after 100 to 1 which means normal!
- The weights are assigned arbitarily and intuitively 
"""

#-----------------------------------------

# Refit using weights and report accuracy. Has it improved?
clf_weights = RandomForestClassifier().fit(X_train, y_train_noisy, sample_weight=weights)
print(accuracy_score(y_test, clf_weights.predict(X_test)))

0.765
0.725




### 6. Classification Metrics: Confusion Matrix 

- Quick runthrough of how to go about returning classification metrics using SKlearn.

**Context**  _a customer who defaulted on their loan, and a "negative" means a customer who continued to pay without problems. The bank manager informed you that the bank makes 10K profit on average from each "good risk" customer, but loses 150K from each "bad risk" customer._

In [19]:
# Fit a random forest classifier to the training data
clf = RandomForestClassifier(random_state=2).fit(X_train, y_train)

# Label the test data
preds = clf.predict(X_test)

# Section in focus------------------------------------
    # Get confusion matrix only
confusion_matrix(y_test, preds)

    # Get false positives/negatives from the confusion matrix
tp, fp, fn, tn = confusion_matrix(y_test, preds).ravel()  # using ravel flattens the confusion matrix 

# Now compute the cost using the manager's advice
cost = fp*10 + fn*150

"""
Note the concept of cost: In our case, since a bad customer identified as good is more expensive than a good customer identified 
as good, the cost value above assigns a monetary value ($-thousands) to the overall classficiation metrics
"""
#--------------------------------------------



'\nNote the concept of cost: In our case, since a bad customer identified as good is more expensive than a good customer identified \nas good, the cost value above assigns a monetary value ($-thousands) to the overall classficiation metrics\n'

### 7. Optimizing Classification Thresholds

The concept of optimizing classification thresholds refers to the idea that in order to reduce false positives or fine tune your confusion matrix properties- you can manually specify or iterate over different threshold values and use that to predict!



In [31]:
clf = RandomForestClassifier(random_state=2).fit(X_train, y_train)
scores = clf.predict_proba(X_test)
y_test = y_test == 'bad'

# Section in focus ------------------------------------
# Create a range of equally spaced threshold values
t_range = [0,0.25,0.5,0.75,1]

# Store the predicted labels for each value of the threshold
preds = [[s[1] > thr for s in scores] for thr in t_range]

# Compute the accuracy for each threshold
accuracies = [accuracy_score(y_test, p) for p in preds]

# Compute the F1 score for each threshold
f1_scores = [f1_score(y_test, p) for p in preds]

"""
1. First we create a list of threshold values to iterate over
2. In a nested list comprehension we use the prediction probabiities we got from predict_proba and 
simply assign 0 or 1 if the predicted probability of the class being 1 > threshold
3. Then we calculate the accuracy score for each and figure out which threshold yielded the best accu and f1scores!
"""

#-----------------------------------------------------


# Report the optimal threshold for accuracy, and for F1
print(t_range[np.argmax(accuracies)], t_range[np.argmax(f1_scores)])

1 0


# -------------- -------------- -------------- -------------- --------------

# Chapter 3 
- Model Life Cycle Management

### 8. Machine Learning Pipelines

The big idea here is to be able to iterate over different versions of models, within which you iterate over >1 types of hyperparamters and the get the best one


In [35]:
# Create pipeline with feature selector and classifier
pipe = Pipeline([
    ('feature_selection', SelectKBest(f_classif)),
    ('clf', RandomForestClassifier(random_state=2))])

# Create a parameter grid
params = {
   'feature_selection__k':[10,20],
    'clf__n_estimators':[2, 5]}

# Initialize the grid search object
grid_search = GridSearchCV(pipe, param_grid=params)

"""
Note:
1. First we set up a pipeline instance wherein we want to iterate over 2 things: number of continous features & 
...the n_estimator hyperparameter of the RandomForestClassifier model. 
    
2. Then we create the parameters that will be iterated OVER. 
    Example: f
    - feature_selection__k will have 10 and 20 iterated over as the K/number of features to select using SelectKBest(f_classif). 
    - clf__n_estimators: same here the n_estimator parameter iterates over values 2,5  

    Keep in mind:
    => While naming parameters always to "pipelineName__K"
    => Note that all possible combinations are covered! So dont include lots of pipeline features as it will drastically 
    push the computational expense!
"""

# Fit it to the data and print the best value combination
print(grid_search.fit(X_train,y_train).best_params_)



{'clf__n_estimators': 5, 'feature_selection__k': 10}
