# Model A: Centroid Anomaly Detection
The first model is a Centroid Anomaly Detection algorithm. See ```Method 1 - Centroid Based Algorithm.ipynb``` for a more detailed explanation of the algorithm and its development.

In [None]:
def model_A_initialize(X_train, multiplier=2.9):
    '''
    This function initializes the centroid anomaly detection algorithm. It standardizes the training data (subtracts
    the column means and divides by the column standard deviations), which makes calculating the centroid trivial
    (it is zeros). It calculates the distance of each data point from the centroid and uses the standard deviation
    of those distances to calculate r, the threshold distance for classifying points as outliers or inliers.
    
    Inputs:
        X_train: a DataFrame of features. All features will be used in the algorithm.
        multiplier: the number of standard deviations of the training distances used as the threshold. Defaults to
            2.9, the value determined through validation.
        
    Outputs:
        centroid: the centroid of the training data, an array of zeros
        r: the threshold distance from the centroid (multiplier * standard deviation of the training distances)
        n: the number of training datapoints
    '''
    #imports
    import numpy as np
    
    #mean normalize the input (DataFrame.std() is the sample standard deviation)
    X_train = (X_train - X_train.mean()) / (X_train.std())

    # the centroid should be zero, so it is hardcoded to avoid rounding errors
    centroid = np.zeros(X_train.shape[1])
    
    # calculate r from data
    train_distances = np.linalg.norm(X_train, axis=1) #centroid is zero, so distance is just the norm
    sigma = train_distances.std() #the standard deviation of the distances (numpy std, i.e. population form)
    r = multiplier * sigma #the distance threshold for outliers
    
    n = X_train.shape[0]
    
    return centroid, r, n

In [None]:
def model_A_update(X_update, centroid, r, n):
    '''
    Score a batch of new data against the centroid model and, when the batch looks mostly normal, move the
    centroid toward it.

    The batch is first standardized using its own column means and standard deviations. Each point whose distance
    from the centroid is greater than or equal to r is labelled an outlier. If fewer than half of the points are
    outliers, the centroid is nudged toward every point of the batch in turn with a step size of 1 / n.

    Inputs:
        X_update: a DataFrame of features. All features will be used in the algorithm.
        centroid: the current centroid, an array with one entry per feature
        r: the threshold distance from the centroid
        n: the number of training datapoints; sets the update step size 1 / n

    Outputs:
        centroid: the (possibly updated) centroid
        r: the threshold distance from the centroid, returned unchanged
        n: returned unchanged
        y_predicted: the predictions for each data point. 0=inlier, 1=outlier
    '''
    #imports
    import pandas as pd
    import numpy as np

    # standardize the batch with its own statistics
    normalized = (X_update - X_update.mean()) / X_update.std()

    # label every point: 1.0 where the distance reaches the threshold, 0.0 otherwise
    distances = np.linalg.norm(normalized - centroid, axis=1)
    y_predicted = (distances >= r).astype(float)
    frac_outliers = y_predicted.sum() / y_predicted.shape[0]

    # only learn from batches in which outliers are the minority
    if frac_outliers < 0.5:
        step = 1 / n
        # NOTE(review): n is not incremented as points are absorbed, so every point gets the same weight;
        # confirm that a fixed step size is intended rather than a running mean.
        for point in normalized.values:
            centroid = centroid + step * (point - centroid)

    return centroid, r, n, y_predicted

# Model B: One Class Support Vector Machine
The second model is a One-Class SVM algorithm. See ```Method 2 - One Class SVM.ipynb``` for a more detailed explanation of the algorithm and its development.

In [None]:
def model_B_initialize(X_train):
    """
    Run the initial training of scikit-learn's OneClassSVM() on the training data. The data is standardized
    (column means subtracted, divided by column standard deviations) before fitting. The hyperparameters
    nu = 0.1 and kernel = 'linear' were optimized previously.

    Inputs:
        X_train: A DataFrame of features. All features are input into the model.

    Output:
        model: The fitted OneClassSVM() model object.
        n_support_vectors: The number of support vectors kept by the fitted model.
    """
    #imports
    import pandas as pd
    from sklearn import svm

    # standardize the training data
    standardized = (X_train - X_train.mean()) / X_train.std()

    # build and fit in one step; fit() returns the estimator itself
    model = svm.OneClassSVM(kernel='linear', nu=0.1).fit(standardized)

    # one row per support vector
    n_support_vectors = model.support_vectors_.shape[0]

    return model, n_support_vectors

In [None]:
def model_B_update(X_update, model, n_support_vectors):
    """
    This function updates the OneClassSVM() algorithm with new data. It first predicts which of the new points are
    outliers using the previous model. If fewer than half of them are outliers, it trains a new model on the support
    vectors from the previous model stacked with the new data. This data set is much smaller than the original, so
    training is faster. nu is set to n_support_vectors / (n_support_vectors + n_new_data) so that the number of
    support vectors stays roughly constant. n_support_vectors is held constant across updates, but the support
    vectors are updated every time the model is refit. If half or more of the new points are outliers, the previous
    model is returned unchanged.
    
    Inputs:
        X_update: a DataFrame of features. All features will be used in the algorithm.
        model: The OneClassSVM() model object from the previous training. The model must include fit results.
        n_support_vectors: The number of support vectors generated in the initial training
        
    Outputs:
        model_update: The updated OneClassSVM() model object with stored fit results, or the input model when no
            refit was performed.
        y_predict: The predictions for each data point. 0=normal, 1=outlier.
    """
    #imports
    import numpy as np
    
    #extract the support vectors
    support_vectors = model.support_vectors_

    #make sure X_update is mean normalized
    X_update = (X_update - X_update.mean()) / (X_update.std())

    #see how many are outliers
    raw_predictions = np.asarray(model.predict(X_update))

    # OneClassSVM.predict() returns -1 for outliers and +1 for inliers; convert to 0=normal, 1=outlier
    y_predict = (raw_predictions == -1).astype(int)
    frac_outliers = y_predict.sum() / y_predict.shape[0]

    # keep the previous model unless the batch is clean enough to learn from
    model_update = model

    if frac_outliers < 0.5:
        # scikit-learn is only needed when a refit actually happens
        from sklearn import svm

        n_new_data = X_update.shape[0]

        #combine the support vectors and the new data points
        X_new = np.vstack((support_vectors, X_update.values))

        #recalculate nu
        nu = n_support_vectors / (n_support_vectors + n_new_data)

        #fit the new data
        model_update = svm.OneClassSVM(nu=nu, kernel='linear')
        model_update.fit(X_new)
        
    return model_update, y_predict