# Machine Learning Pipelines Notes 
- Darshil Desai

Only put in stuff that is new to you, since you are already familiar with model fitting and training- do not overlap existing knowledge! 


In [52]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, FunctionTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.feature_selection import f_classif, chi2, SelectKBest
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, make_scorer,roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
import pickle
from sklearn.neighbors import LocalOutlierFactor as lof, DistanceMetric as dm
from scipy.spatial.distance import squareform, pdist

In [46]:
# Datasets
credit = pd.read_csv('credit.csv')
flows = pd.read_csv('lanl_flows.csv')
proteins = pd.read_csv('proteins_exercises.csv')

# Chapter 1

### 1. Label Encoding
- Similar to Pandas "to_categorical" method, this feature in scikit-learn allows you to numerically represent your categorical variables!


In [3]:
non_numeric_columns = ['checking_status', 'credit_history', 'purpose', 'savings_status', 'employment', 'personal_status',
                      'other_parties', 'property_magnitude', 'other_payment_plans','housing', 'job', 'own_telephone','foreign_worker']

# Create a label encoder for each column. Encode the values
for column in non_numeric_columns:
    le = LabelEncoder()
    credit[column] = le.fit_transform(credit[column])
    
credit.head(1)    

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,1,6,1,4,1169,4,3,4,3,2,...,2,67,1,1,2,3,1,1,1,good


### 2. Param Grid

- This is an important concept that allows you to iterate over the HYPER-PARAMETERS of the model you choose and returns the one
with the best accuracy

In [4]:
try:
    y = credit['class']
    X = credit.drop('class', axis=1)    
except:
    print ('class col dropped')


In [5]:
# Split the data into train and test, with 20% as test
X_train, X_test, y_train, y_test = train_test_split(
  X, y, test_size=0.2, random_state=1)

# Set a range for n_estimators from 10 to 40 in steps of 10
param_grid = {'n_estimators': list(range(10, 50,10))}  # the hyper parameter here = n_estimators

# Not you define a model and simply plug in the hyper param values to iterate over
grid = GridSearchCV(RandomForestClassifier(), param_grid, cv=3)

grid.fit(X, y)
grid.best_params_  # the one that yields the best accuracy. 


# Question: does grid search work for contonous predictor variables? 

{'n_estimators': 20}

### 3. Categorical Encodings
This represents one-hot encoded categorical features! - Know this so will skip over explaining

In [6]:
# Create numeric encoding for credit_history
credit_history_num = LabelEncoder().fit_transform(
  credit['credit_history'])   # here we first transform the cats into numerical representation. Although not neccessary

# Create a new feature matrix including the numeric encoding
X_num = pd.concat([X, pd.Series(credit_history_num)], 1)

# Create new feature matrix with dummies for credit_history
X_hot = pd.concat(
  [X, pd.get_dummies(credit['credit_history'])],1)

# Compare the number of features of the resulting DataFrames
X_hot.shape[1] > X_num.shape[1]

True

## 4. f_classif - Statistical method for feature selection
Basically this helps us pick features that are continuous! Similar to how we pick categorical variables using chi-square tests

Note: 
"IF the features are quantitative, compute the ANOVA F-value between each feature and the target vector....The F-value scores examine **IF**, when we group the numerical feature by the target vector, the means for each group are significantly different."  - [chris_albon](https://chrisalbon.com/machine_learning/feature_selection/anova_f-value_for_feature_selection/)

In [7]:
# Function computing absolute difference from column mean
def abs_diff(x):
    return np.abs(x- np.mean(x))

# Apply it to the credit amount and store to new column
credit['credit_amount_diff'] = abs_diff(credit.credit_amount)

# Score old and new versions of this feature with f_classif()
scores = f_classif( credit[['credit_amount', 'credit_amount_diff']], credit['class'])[0]

# Inspect the scores and drop the lowest-scoring feature
credit_new = credit.drop(['credit_amount'], 1)

### ------------------ -------------------------- ------------------------ --------------------- ---------------------

# Chapter 2

### 4. Advanced Data Wrangling

- Take a look at some smart ways to wrangle data so you can draw inspiration from it in your feature work

In [8]:
flows.head(3)

Unnamed: 0,time,duration,source_computer,source_port,destination_computer,destination_port,protocol,packet_count,byte_count
0,471692,0,C5808,N24128,C26871,N17023,6,1,60
1,471692,0,C5808,N2414,C26871,N19148,6,1,60
2,471692,0,C5808,N24156,C26871,N8001,6,1,60


In [9]:
# Some function to apply on a dataframe
def featurize(df):
    """
    Takes in a dataframe and returns a dictionary with relevant information
    """
    return {
    'unique_ports': len(set(df['destination_port'])),
    'average_packet': np.mean(df['packet_count']),
    'average_duration': np.mean(df['duration'])
    }

#Group by source computer, and apply the feature extractor
out = flows.groupby('source_computer')  # This is an iterator object where each object is a dataframe grouped by the 
                                        #....."sourced computer" variable
    
out    

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000019476A0DC88>

In [10]:
"""
# Using apply to literaly apply the "featurize" function on top on each dataframe
    - Note the total number of rows, that means the iterator object had that many dataframes as each object!
"""

out = flows.groupby('source_computer').apply(featurize)

In [11]:
out[1:3]

source_computer
C10026    {'unique_ports': 2, 'average_packet': 21.0, 'a...
C10047    {'unique_ports': 5, 'average_packet': 21.07692...
dtype: object

In [12]:
# Convert the iterator to a dataframe by calling list on it
X = pd.DataFrame(list(out), index= out.index)

In [13]:
X.head(2)

Unnamed: 0_level_0,average_duration,average_packet,unique_ports
source_computer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C10,5.0,222.0,4
C10026,39.0,21.0,2


In [14]:
bads = ['C9945', 'C9723', 'C977', 'C10']

# Check which sources in X.index are bad to create labels
y = [x in bads for x in X.index]

In [15]:
# Report the average accuracy of Adaboost over 3-fold CV
print(np.mean(cross_val_score(AdaBoostClassifier(), X, y)))

0.9966414564404514




### 4. Advanced Feature Engineering

- Section in focus covers noteworthy ways in which to data wrangle

In [16]:

#SECTION IN FOCUS------------------------------
# Create a feature counting unique protocols per source
protocols = flows.groupby('source_computer').apply(
  lambda df: len(set(df.protocol)) )

"""
1. First the groupby simply creates an iterator object where each object is a df
2. next you use apply and lambda to calculate the number of unique values in the "protocol" column in EACH object/df !

So now each row has source_computer or ID and the unique protocols which is then added onto the main feature set
"""
#------------------------------


# Convert this feature into a dataframe, naming the column
protocols_DF = pd.DataFrame(
  protocols, index=protocols.index, columns=['protocol'])

# Now concatenate this feature with the previous dataset, X
X_more = pd.concat([X, protocols_DF], axis=1)  # could also use merge...

# Refit the classifier and report its accuracy
print(np.mean(cross_val_score(
  AdaBoostClassifier(), X_more, y)))


0.9966414564404514




In [17]:
# Interesting implementation of the uniqly package! Figure out how to iterate over all columns in one passing

protocols1 = flows.groupby('source_computer').apply(
  lambda df: list(set(df.protocol)) )

### 5. Sample Weights 

The idea here is that we can weight or give importance to some labels more than others! 

- For example: : _"One of your cyber analysts informs you that many of the labels for the first 100 source computers in your training data might be wrong because of a database error. She hopes you can still use the data because most of the labels are still correct, but asks you to treat these 100 labels as "noisy". Thankfully you know how to do that, using weighted learning. "_

    So here essentially in our entire training dataset the first 100 labels are "iffy" but the rest of the labels from 101 to whatever are fine! So we can still use this model by applying weights to the first 100 ! 

In [18]:
# Lets say that the first 100 labels were labeled using a heuristic process or some unreliable process
y_train_noisy = y_train  # assuming right

# Fit a classifier to the training data
clf = RandomForestClassifier().fit(X_train, y_train_noisy)

# Report its accuracy on the test data
print(accuracy_score(y_test, clf.predict(X_test)))

# Section in focus----------------------------------
    # Assign half the weight to the first 100 noisy examples
weights = [0.5]*100 + [1.0]*(len(y_train)-100)

"""
Here we set the weights of the first 100 to 0.5 and the rest after 100 to 1 which means normal!
- The weights are assigned arbitarily and intuitively 
"""

#-----------------------------------------

# Refit using weights and report accuracy. Has it improved?
clf_weights = RandomForestClassifier().fit(X_train, y_train_noisy, sample_weight=weights)
print(accuracy_score(y_test, clf_weights.predict(X_test)))

0.755
0.72




### 6. Classification Metrics: Confusion Matrix 

- Quick runthrough of how to go about returning classification metrics using SKlearn.

**Context**  _a customer who defaulted on their loan, and a "negative" means a customer who continued to pay without problems. The bank manager informed you that the bank makes 10K profit on average from each "good risk" customer, but loses 150K from each "bad risk" customer._

In [19]:
# Fit a random forest classifier to the training data
clf = RandomForestClassifier(random_state=2).fit(X_train, y_train)

# Label the test data
preds = clf.predict(X_test)

# Section in focus------------------------------------
    # Get confusion matrix only
confusion_matrix(y_test, preds)

    # Get false positives/negatives from the confusion matrix
tp, fp, fn, tn = confusion_matrix(y_test, preds).ravel()  # using ravel flattens the confusion matrix 

# Now compute the cost using the manager's advice
cost = fp*10 + fn*150

"""
Note the concept of cost: In our case, since a bad customer identified as good is more expensive than a good customer identified 
as good, the cost value above assigns a monetary value ($-thousands) to the overall classficiation metrics
"""
#--------------------------------------------



'\nNote the concept of cost: In our case, since a bad customer identified as good is more expensive than a good customer identified \nas good, the cost value above assigns a monetary value ($-thousands) to the overall classficiation metrics\n'

### 7. Optimizing Classification Thresholds

The concept of optimizing classification thresholds refers to the idea that in order to reduce false positives or fine tune your confusion matrix properties- you can manually specify or iterate over different threshold values and use that to predict!

In [20]:
clf = RandomForestClassifier(random_state=2).fit(X_train, y_train)
scores = clf.predict_proba(X_test)
y_test = y_test == 'bad'

# Section in focus ------------------------------------
# Create a range of equally spaced threshold values
t_range = [0,0.25,0.5,0.75,1]

# Store the predicted labels for each value of the threshold
preds = [[s[1] > thr for s in scores] for thr in t_range]

# Compute the accuracy for each threshold
accuracies = [accuracy_score(y_test, p) for p in preds]

# Compute the F1 score for each threshold
f1_scores = [f1_score(y_test, p) for p in preds]

"""
1. First we create a list of threshold values to iterate over
2. In a nested list comprehension we use the prediction probabiities we got from predict_proba and 
simply assign 0 or 1 if the predicted probability of the class being 1 > threshold
3. Then we calculate the accuracy score for each and figure out which threshold yielded the best accu and f1scores!
"""

#-----------------------------------------------------


# Report the optimal threshold for accuracy, and for F1
print(t_range[np.argmax(accuracies)], t_range[np.argmax(f1_scores)])



1 0


  'precision', 'predicted', average, warn_for)


# -------------- -------------- -------------- -------------- --------------

# Chapter 3 
- Model Life Cycle Management

### 8. Machine Learning Pipelines

The big idea here is to be able to iterate over different versions of models, within which you iterate over >1 types of hyperparamters and the get the best one


In [21]:
# Create pipeline with feature selector and classifier
pipe = Pipeline([
    ('feature_selection', SelectKBest(f_classif)),
    ('clf', RandomForestClassifier(random_state=2))])

# Create a parameter grid
params = {
   'feature_selection__k':[10,20],
    'clf__n_estimators':[2, 5]}

# Initialize the grid search object
grid_search = GridSearchCV(pipe, param_grid=params)

"""
Note:
1. First we set up a pipeline instance wherein we want to iterate over 2 things: number of continous features & 
...the n_estimator hyperparameter of the RandomForestClassifier model. 
    
2. Then we create the parameters that will be iterated OVER. 
    Example: f
    - feature_selection__k will have 10 and 20 iterated over as the K/number of features to select using SelectKBest(f_classif). 
    - clf__n_estimators: same here the n_estimator parameter iterates over values 2,5  

    Keep in mind:
    => While naming parameters always to "pipelineName__K"
    => Note that all possible combinations are covered! So dont include lots of pipeline features as it will drastically 
    push the computational expense!
"""

# Fit it to the data and print the best value combination
print(grid_search.fit(X_train,y_train).best_params_)



{'clf__n_estimators': 5, 'feature_selection__k': 10}


#### Customizing the metric used in the pipeline

By default the pipeline picks the best combination of hyper parameters that yields the highest accuracy! 

In [22]:
y_train = y_train == 'bad'

# Create pipeline with feature selector and classifier
pipe = Pipeline([
    ('feature_selection', SelectKBest(f_classif)),
    ('clf', RandomForestClassifier(random_state=2))])

# Create a parameter grid
params = {
   'feature_selection__k':[10,20],
    'clf__n_estimators':[2, 5]}

#Section in focus ------------------------------------
# Create a custom scorer
scorer = make_scorer(f1_score)

# Initialize the CV object
gs = GridSearchCV(pipe, param_grid=params, scoring=scorer)

"""
Note: 
    1. Here we make use our custom chosen metric using the make_scorer
    2 then we pass it into the gridsearch process which later will yield the best
    hyper -parameters for the optimized value of the metric fed in! 
"""

#-----------------------------------------------------------------

# Fit it to the data and print the winning combination
print(gs.fit(X_train, y_train).best_params_)



{'clf__n_estimators': 5, 'feature_selection__k': 10}


### 9. Deploying Machine Learning in Production Environments

In this section we will learn how to deploy an ML model into production by using Pickle. Note, although we've already done this using Joblib but in this section we will see interesting ways in how SKlearn's pipeline feature is useful in not only saving the model parameters but also the data transformations that preceed it

In [23]:
# Fit a random forest to the training set
clf = RandomForestClassifier(random_state=42).fit(
  X_train, y_train)

#Section in focus-------------------------------------

    # Save it to a file, to be pushed to production
with open('model.pkl', 'wb') as file:
    pickle.dump(clf, file=file)
    
    # Now load the model from file in the production environment
with open('model.pkl', 'rb') as file:
    clf_from_file = pickle.load(file)

    # Predict the labels of the test dataset
preds = clf_from_file.predict(X_test)

"""
Note:
    1. first we save our model in a binary Pickle file. Same as you would Joblib. Note the 'wb' means write binary
    2. Next we load and predict. 
"""

#-----------------------------------------------------



"\nNote:\n    1. first we save our model in a binary Pickle file. Same as you would Joblib. Note the 'wb' means write binary\n    2. Next we load and predict. \n"

#### Applying Data Transformations in Production

This section shows how to apply the neccessary data transformations needed on production/out-sample data before feeding it into the model

In [24]:

#Section in focus------------------------------

    # Define a feature extractor to flag very large values
def more_than_average(X, multiplier=1.0):    
    Z = X.copy()
    Z[:,1] = Z[:,1] > multiplier*np.mean(Z[:,1])
    return Z

    # Convert your function so that it can be used in a pipeline
pipe = Pipeline([
  ('ft', FunctionTransformer(more_than_average)),
  ('clf', RandomForestClassifier(random_state=2))])

    # Optimize the parameter multiplier using GridSearchCV
params = {
   'ft__multiplier':[0.5,1,2,3]}

"""
Note: 
    1. Here we first define a custom transformation function called more_than_average that deals with the second column/value
    of an incoming numpy matrix or array
    2. We pass that in the pipeline using FunctionTransformer 
    3. Finally we use the params grid search method to iterate over this multiplier to check which transformation yields
    the best metric!
"""

#--------------------------------------------------
grid_search = GridSearchCV(pipe, param_grid=params)

# Fit it to the data and print the winning combination
print(gs.fit(X_train, y_train).best_params_)



{'clf__n_estimators': 5, 'feature_selection__k': 10}


#### Agile Software Development Practices in Model Deployment


**Context:** _Having pushed your random forest to production, you suddenly worry that a naive Bayes classifier might be better. You want to run a champion-challenger test, by comparing a naive Bayes, acting as the challenger, to exactly the model which is currently in production, which you will load from file to make sure there is no confusion. You will use the F1 score for assessment_. 


In this section we will look at how to go about switching out models in production with relative ease. Our context suggests that an NB model may be better and worth taking a look. This is what we call a _"challenger"_ model, since we're sort of challenging the current model in production. The current model in production is a RandomForestClassifier which we can refer to as the _"champion"_



In [25]:
# Load the CURRENT model from disk
champion = pickle.load(open('model.pkl', 'rb'))

# Fit a Gaussian Naive Bayes to the training data. 
challenger = GaussianNB().fit(X_train, y_train)  # The NEW OR CHALLENGER model to try out

# Section in focus ----------------------------------------
    # Print the F1 test scores of both champion and challenger
print(f1_score(y_test, champion.predict(X_test)))
print(f1_score(y_test, challenger.predict(X_test)))

"""
Note:
    1. simply we load up both models - champ and challenger
    2. compare f1 scores and save into pickle (hence deploy) the best performing one
"""

#----------------------------------------------------------

# Write back to disk the best-performing model
with open('model.pkl', 'wb') as file:
    pickle.dump(champion, file=file)

0.5294117647058824
0.46428571428571425


### 10. Production level phenomenons: Dataset shift & Domai Shift

Datashifts are very interesting phenonmenons that occur when model results/predictions in a production environment do not replicate the quality of the results in development environment.

2 Reasons as to why: 

1. **Dataset shift:** This is caused due to temporal changes in the production data. For instance: if you trained on a dataset before of a company BEFORE they made some changes in the way they calculate their features then you can see this diff in the production environment

    How to spot this? : Using the concept of **window slides** where basically you take different sections of the training data pertaining to different time period and test it out  **(EXPAND ON THIS)**


2. Domain Shift: Intuively easier to understand. If you trained your model on all adult males and then in production you predicted on female adults and kids the predictions would be off!

In [27]:
wrange = [1,2,3]

# Loop over window sizes
for w_size in wrange:

# Section in focus----------------------------    
    
    # Define sliding window
    sliding = arrh.loc[(t_now -w_size+1):t_now]

    # Extract X and y from the sliding window
    X, y = sliding.drop('class', 1), sliding['class']
    
    # Fit the classifier and store the F1 score
    preds = GaussianNB().fit(X, y).predict(X_test)
    accuracies.append(f1_score(y_test, preds))

"""
Note: 
    1. We loop over different sliding windows / sections of the entire dataset
    2. Next we fit the X, Y of that WINDOW
    3. predict and append the f1 score
    4. Finally we get the value of the window that gave the best f1_score / metric 
"""
    
#------------------------------------------    
    
# Estimate the best performing window size
optimal_window = wrange[np.argmax(accuracies)]

NameError: name 'arrh' is not defined

# -------------- -------------- -------------- -------------- --------------

# Chapter 4 
- Unsupervised Workflows refer to implementing ML workflows in the space of unsupervised algorithms. The approaches to supervised and unsupervised, although overlapping, can starkly differ in some aspects of model fitting and training. Lets explore! 

### 11 Local Anomaly Detection algorithm

This section covers a simple LOCAL anomaly detection algorithm wherein each point is observed in relation to the datapoints around it and if its "alone" then its an outlier. This is different from a global anamoly detection system wherein outliers outside the range are considered to be anamolies


_Context_ :
------------------

a. Outlier detection:
 	The training data contains outliers which are defined as observations that are far from the others. Outlier detection estimators thus try to fit the regions where the training data is the most concentrated, ignoring the deviant observations.
novelty detection:
 	The training data is not polluted by outliers and we are interested in detecting whether a new observation is an outlier. In this context an outlier is also called a novelty.    
    
    
b. Novelty detection:
 	The training data is not polluted by outliers and we are interested in detecting whether a new observation is an outlier. In this context an outlier is also called a novelty.    
    
[Sklearn Documentation](https://scikit-learn.org/stable/modules/outlier_detection.html)

In [28]:
x = [1]*30
x.append(10)

X = pd.DataFrame(x)

# Fit the local outlier factor and print the outlier scores
print(lof().fit_predict(X))

# Create the list [1.0, 1.0, ..., 1.0, 10.0] as explained
x = [1.0]*30
x.append(10.0)  # So this 10 is the anomaly

# Cast to a data frame
X = pd.DataFrame(x)

# Fit the local outlier factor and print the outlier scores
print(lof().fit_predict(X))  # if 1 then its normal, if -1 then  the datapoint is an anomaly

[ 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1 -1]
[ 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1 -1]




#### Novelty Anomaly Detection algorithm

Novelty anomaly detection algorithms simply focus on predicting on newor out-sample datasets. Theres different types including one-class SVM, local novelty detection (**shown below**) , Isolation forests and others. 

**The concept of contamination:**  _The amount of contamination of the data set, i.e. the proportion of outliers in the data set. When fitting this is used to define the threshold on the decision function._

In [29]:
# Create a list of thirty 1s and cast to a dataframe
X = pd.DataFrame([1.0]*30)

# Create an instance of a lof novelty detector
detector = lof(novelty=True)

# Fit the detector to the data
detector.fit(X)

# Use it to predict the label of an example with value 10.0
print(detector.predict(pd.DataFrame([10])))  # new test datapoint 

[-1]




#### Customizing thresholds using Score Samples

The `score_samples(X_test)` method allows you get raw scores for the values in test being outliers. The LOWER the score, the more anomalous the data point

In [None]:

# Fit a one-class SVM detector and score the test data
nov_det = onesvm().fit(X_train)


# Section in focus-----------------------------------
scores = nov_det.score_samples(X_test)

# Find the observed proportion of outliers in the test data
prop = np.mean(y_test==1)

# Compute the appropriate threshold
threshold = np.quantile(scores, prop)

"""
Note:
    - scores = the raw scores for anomaly. Low = more anomolos
    - we find the proportion of anomalies in the y of the test dataset
    - in scores if the value is above the threshold then its an anomaly. This is where the custom comes into play
"""
#---------------------------------------------------------


# Print the confusion matrix for the thresholded scores
print(confusion_matrix(y_test, scores > threshold))


## 12. Distance & Similarity

In this sectionn we will cover how to use distance metrics and incorporate them as parameters in the outlier model fitting process. Why? because the any _local outlier factor algorithm depends a lot on the idea of a nearest neighbor, which in turn depends on the choice of distance metric._

But first a quick recap of diff distance metrics! : 

- a. Euclidean distance : sqrt of sum of squared diff 
- b. Chebyshev: element wise distance calculation and then the max distance is taken
- c. Hamming distance: quite the opposite of the above 2, wherein the closest hamming distance is the furthest euclidean distance. Example: vectors incorporating product IDs between lets say 2 customers. Similar purchases will be grouped together





In [32]:
# setting the dataframe
features = X_train

# Find the Euclidean distance between all pairs
dist_eucl = dm.get_metric('euclidean').pairwise(features)

# Find the Hamming distance between all pairs
dist_hamm = dm.get_metric('hamming').pairwise(features)

# Find the Chebyshev distance between all pairs
dist_cheb = dm.get_metric('chebyshev').pairwise(features)

## 13. Unstructured Data

Unstructured datasets are unlike the tabular datasets we've used such that there exist a fixed number of features. Some examples of unstructured datasets are : 

- Audio data
- Image data (before resizing all to one size)
- Sequence DNA data. ex: ABBB, ABBC etc 

In [53]:
# Wrap the RD-Levenshtein metric in a custom function
def my_rdlevenshtein(u, v):
    return stringdist.rdlevenshtein(u[0], v[0])

# Reshape the array into a numpy matrix
sequences = np.array(proteins['seq']).reshape(-1, 1)

# Compute the pairwise distance matrix in square form
M = squareform(pdist(sequences, my_rdlevenshtein))

# Run a LoF algorithm on the precomputed distance matrix
preds = lof(metric='precomputed').fit_predict(M)

# Compute the accuracy of the outlier predictions
print(accuracy( (proteins['label']=='VIRUS'), (preds == -1) ))

NameError: name 'stringdist' is not defined

In [71]:
# Create a feature encoding the first letter of the string
proteins['first'] =  proteins['seq'].str[0]

In [75]:
# Create a feature that contains the length of the string
proteins['len'] = proteins['seq'].str.len()

# Create a feature encoding the first letter of the string
proteins['first'] =  LabelEncoder().fit_transform(proteins['seq'].str[0])

In [78]:
# Extract scores from the fitted LoF object, compute its AUC
scores_lof = lof_detector.score_samples

NameError: name 'lof_detector' is not defined

In [80]:
proteins['label']=='VIRUS'

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8      True
9      True
10     True
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23     True
24    False
25    False
26    False
27    False
28    False
29    False
      ...  
70    False
71    False
72    False
73    False
74    False
75    False
76    False
77    False
78    False
79    False
80    False
81    False
82    False
83    False
84    False
85    False
86    False
87     True
88     True
89     True
90     True
91     True
92    False
93    False
94    False
95     True
96     True
97    False
98    False
99    False
Name: label, Length: 100, dtype: bool