# Machine Learning Pipelines Notes 
- Darshil Desai

Only put in stuff that is new to you, since you are already familiar with model fitting and training- do not overlap existing knowledge! 


In [131]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
# from sklearn.feature_extraction import CountV 
from sklearn.feature_selection import f_classif, chi2, SelectKBest

In [62]:
# Datasets
credit = pd.read_csv('credit.csv')
flows = pd.read_csv('lanl_flows.csv')

# Chapter 1

### 1. Label Encoding
- Similar to Pandas "to_categorical" method, this feature in scikit-learn allows you to numerically represent your categorical variables!


In [10]:
non_numeric_columns = ['checking_status', 'credit_history', 'purpose', 'savings_status', 'employment', 'personal_status',
                      'other_parties', 'property_magnitude', 'other_payment_plans','housing', 'job', 'own_telephone','foreign_worker']

# Create a label encoder for each column. Encode the values
for column in non_numeric_columns:
    le = LabelEncoder()
    credit[column] = le.fit_transform(credit[column])
    
credit.head(1)    

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,1,6,1,4,1169,4,3,4,3,2,...,2,67,1,1,2,3,1,1,1,good


### 2. Param Grid

- This is an important concept that allows you to iterate over the HYPER-PARAMETERS of the model you choose and returns the one
with the best accuracy

In [23]:
try:
    y = credit['class']
    X = credit.drop('class', axis=1)    
except:
    print ('class col dropped')


In [31]:
# Split the data into train and test, with 20% as test
X_train, X_test, y_train, y_test = train_test_split(
  X, y, test_size=0.2, random_state=1)

# Set a range for n_estimators from 10 to 40 in steps of 10
param_grid = {'n_estimators': list(range(10, 50,10))}  # the hyper parameter here = n_estimators

# Not you define a model and simply plug in the hyper param values to iterate over
grid = GridSearchCV(RandomForestClassifier(), param_grid, cv=3)

grid.fit(X, y)
grid.best_params_  # the one that yields the best accuracy. 


# Question: does grid search work for contonous predictor variables? 

{'n_estimators': 30}

### 3. Categorical Encodings
This represents one-hot encoded categorical features! - Know this so will skip over explaining

In [32]:
# Create numeric encoding for credit_history
credit_history_num = LabelEncoder().fit_transform(
  credit['credit_history'])   # here we first transform the cats into numerical representation. Although not neccessary

# Create a new feature matrix including the numeric encoding
X_num = pd.concat([X, pd.Series(credit_history_num)], 1)

# Create new feature matrix with dummies for credit_history
X_hot = pd.concat(
  [X, pd.get_dummies(credit['credit_history'])],1)

# Compare the number of features of the resulting DataFrames
X_hot.shape[1] > X_num.shape[1]

True

## 4. f_classif - Statistical method for feature selection
Basically this helps us pick features that are continuous! Similar to how we pick categorical variables using chi-square tests

Note: 
"IF the features are quantitative, compute the ANOVA F-value between each feature and the target vector....The F-value scores examine **IF**, when we group the numerical feature by the target vector, the means for each group are significantly different."  - [chris_albon](https://chrisalbon.com/machine_learning/feature_selection/anova_f-value_for_feature_selection/)

In [53]:
# Function computing absolute difference from column mean
def abs_diff(x):
    return np.abs(x- np.mean(x))

# Apply it to the credit amount and store to new column
credit['credit_amount_diff'] = abs_diff(credit.credit_amount)

# Score old and new versions of this feature with f_classif()
scores = f_classif( credit[['credit_amount', 'credit_amount_diff']], credit['class'])[0]

# Inspect the scores and drop the lowest-scoring feature
credit_new = credit.drop(['credit_amount'], 1)

### ------------------ -------------------------- ------------------------ --------------------- ---------------------

# Chapter 2

### 4. Advanced Data Wrangling

- Take a look at some smart ways to wrangle data so you can draw inspiration from it in your feature work

In [70]:
flows.head(3)

Unnamed: 0,time,duration,source_computer,source_port,destination_computer,destination_port,protocol,packet_count,byte_count
0,471692,0,C5808,N24128,C26871,N17023,6,1,60
1,471692,0,C5808,N2414,C26871,N19148,6,1,60
2,471692,0,C5808,N24156,C26871,N8001,6,1,60


In [75]:
# Some function to apply on a dataframe
def featurize(df):
    """
    Takes in a dataframe and returns a dictionary with relevant information
    """
    return {
    'unique_ports': len(set(df['destination_port'])),
    'average_packet': np.mean(df['packet_count']),
    'average_duration': np.mean(df['duration'])
    }

#Group by source computer, and apply the feature extractor
out = flows.groupby('source_computer')  # This is an iterator object where each object is a dataframe grouped by the 
                                        #....."sourced computer" variable
    
out    

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000196D97357F0>

In [84]:
"""
# Using apply to literaly apply the "featurize" function on top on each dataframe
    - Note the total number of rows, that means the iterator object had that many dataframes as each object!
"""

out = flows.groupby('source_computer').apply(featurize)

In [102]:
out[1:3]

source_computer
C10026    {'unique_ports': 2, 'average_packet': 21.0, 'a...
C10047    {'unique_ports': 5, 'average_packet': 21.07692...
dtype: object

In [92]:
# Convert the iterator to a dataframe by calling list on it
X = pd.DataFrame(list(out), index= out.index)

In [104]:
X.head(2)

Unnamed: 0_level_0,average_duration,average_packet,unique_ports
source_computer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C10,5.0,222.0,4
C10026,39.0,21.0,2


In [126]:
bads = ['C9945', 'C9723', 'C977', 'C10']

# Check which sources in X.index are bad to create labels
y = [x in bads for x in X.index]

In [132]:
# Report the average accuracy of Adaboost over 3-fold CV
print(np.mean(cross_val_score(AdaBoostClassifier(), X, y)))

0.9966414564404514




### 4. Advanced Feature Engineering

- Section in focus

In [149]:


#SECTION IN FOCUS------------------------------
# Create a feature counting unique protocols per source
protocols = flows.groupby('source_computer').apply(
  lambda df: len(set(df.protocol)) )

"""
1. First the groupby simply creates an iterator object where each object is a df
2. next you use apply and lambda to calculate the unique protocol in EACH object/df !

So now each row has source_computer or ID and the unique protocols which is then added onto the main feature set
"""
#------------------------------


# Convert this feature into a dataframe, naming the column
protocols_DF = pd.DataFrame(
  protocols, index=protocols.index, columns=['protocol'])

# Now concatenate this feature with the previous dataset, X
X_more = pd.concat([X, protocols_DF], axis=1)  # could also use merge...

# Refit the classifier and report its accuracy
print(np.mean(cross_val_score(
  AdaBoostClassifier(), X_more, y)))


0.9966414564404514


