In [1]:
# Imports for project purposes
# Full Project imports
import pandas as pd
import math as mt
import dateutil
from datetime import datetime, timedelta
import requests as rd
import numpy as np
from sklearn import neighbors, decomposition
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import smtplib
import scipy.stats as st
import os
from datetime import datetime, timedelta
from pandas.api.types import is_numeric_dtype
import copy
from sklearn.model_selection import train_test_split

In [2]:
class Anomaly:
    '''
    Base class for an anomaly detection method.

    Holds a reference to the full dataset (``data``) and to the train/test
    partitions (``trainDf`` / ``testDf``) that are filled in by the loaders,
    the setters or ``assign_train_test``.
    '''
    # Class-level defaults; instances overwrite them through the constructor, loaders and setters
    data = None
    trainDf = None
    testDf = None

    # Constructor to set values for data
    def __init__(self, input_data = None):
        '''
        Constructor for setting dataset reference to a specific dataset

        Parameters
        ----------
        input_data: Pandas DataFrame reference - Your dataset in the form of a Pandas DataFrame (or None to load it later)

        Raises
        ------
        AssertionError
            If input_data is neither None nor a pandas DataFrame
        '''
        # Ensuring data is properly formatted (isinstance also accepts DataFrame subclasses)
        assert input_data is None or isinstance(input_data, pd.DataFrame), "inputted data is not a pandas DataFrame"
        self.data = input_data


    # Loading data into project
    def load_html(self, link: str, table_index: int = 0) -> pd.DataFrame:
        '''
        Loads an HTML table and sets it as the dataset for the model.

        Common issues: inputting an invalid file path (your file will not be read if this is the case),
        linking another file format (ensure that your link is indeed a link to a website with tables), or giving a
        link to a website which does not allow scraping of its information.

        Parameters
        ----------
        link: String - The link to the dataset that is being loaded
        table_index: int - Which of the tables found on the page to use. Default = 0 (the first table)

        Returns
        -------
        data : pandas.DataFrame
            Returns the DataFrame that has just been loaded as the dataset for the model

        Raises
        ------
        IndexError
            If the page holds fewer than table_index + 1 tables
        '''
        # pd.read_html returns a LIST of DataFrames (one per table on the page),
        # so a single table has to be selected to honour the DataFrame contract.
        tables = pd.read_html(link)
        self.data = tables[table_index]
        return self.data


    # Loading data into project
    def load_csv(self, link: str) -> pd.DataFrame:
        '''
        Loads a CSV table and sets it as the dataset for the model.

        Common issues: Incorrect file path (ensure your file path is valid), a failure to enter a valid CSV
        (ensure your file is in CSV format)

        Parameters
        ----------
        link: String - The link to the dataset that is being loaded

        Returns
        -------
        data : pandas.DataFrame
            Returns the entire DataFrame that has just been loaded as the dataset for the model
        '''
        self.data = pd.read_csv(link)
        return self.data


    # Loading data into project
    def load_excel(self, link: str) -> pd.DataFrame:
        '''
        Loads an Excel table and sets it as the dataset for the model.

        Common issues: Incorrect file path (ensure your file path is valid), a failure to enter a valid Excel
        (ensure your file is in Excel format), Random spaces within your data (A random space within an Excel file might be read as an NaN value)

        Parameters
        ----------
        link: String - The link to the dataset that is being loaded

        Returns
        -------
        data : pandas.DataFrame
            Returns the entire DataFrame that has just been loaded as the dataset for the model
        '''
        self.data = pd.read_excel(link)
        return self.data


    # Loading data into project
    def load_sql_table(self, link: str, con = None) -> pd.DataFrame:
        '''
        Loads a SQL table and sets it as the dataset for the model.

        Parameters
        ----------
        link: String - The name of the SQL table that is being loaded
        con: database connection handed to pandas.read_sql_table (required by pandas)

        Returns
        -------
        data : pandas.DataFrame
            Returns the entire DataFrame that has just been loaded as the dataset for the model

        Raises
        ------
        TypeError
            If no connection is supplied
        '''
        # pandas cannot read a table without a connection; fail with an explicit
        # message instead of the opaque "missing positional argument" error.
        if con is None:
            raise TypeError("load_sql_table() requires a database connection, pass it as con=...")
        self.data = pd.read_sql_table(link, con)
        return self.data


    # Setter for the training set
    def set_train(self, trainingSet: pd.DataFrame):
        '''
        A setter for the training set.

        Parameters
        ----------
        trainingSet: pandas DataFrame - A DataFrame object that will serve as your training set
        '''
        self.trainDf = trainingSet


    # Setter for the test set
    def set_test(self, testSet: pd.DataFrame):
        '''
        A setter for the test set.

        Parameters
        ----------
        testSet: pandas DataFrame - A DataFrame object that will serve as your test set
        '''
        self.testDf = testSet


    # Randomly split train and test set
    def assign_train_test(self, random_state = 42, training_set_ratio = 0.8, shuffling = True):
        '''
        A default random splitter into train and test set

        Parameters
        ----------
        random_state : int - determines random state fed to the splitter for reproducibility of random results, default is 42
        training_set_ratio: float between 0.0 and 1.0 - what % of your data you would like to encompass the training set (test set will be made in complementary way) default is 0.8
        shuffling: boolean - whether or not you would like your data randomly shuffled out of chronology prior to split (True/False). Default is True.

        Returns
        -------
        (trainDf, testDf) : tuple of pandas.DataFrame
            The two partitions, which are also stored on the object

        Raises
        ------
        AssertionError
            If no dataset has been attached yet
        '''
        # Ensuring that data actually exists before splitting
        assert self.data is not None, "You cannot assign a train and test set out of a dataset that has not been initialized"

        # Splitting into train and test; random_state is forwarded so the documented
        # reproducibility actually holds
        self.trainDf, self.testDf = train_test_split(
            self.data,
            train_size = training_set_ratio,
            shuffle = shuffling,
            random_state = random_state,
        )
        return self.trainDf, self.testDf
    

In [11]:
class Bootstrap(Anomaly):
    '''A class for returning anomaly of categorical column counts, utilizing the metric of surprise (entropy)'''
    # Class-level defaults; the constructor and the split/setter methods overwrite them per instance
    data = None
    trainDf = None
    testDf = None
    timestamp = None
    params = None

    # Python keeps only ONE __init__ per class, so the former "overloads" are merged
    # into this single constructor in which every argument is optional.
    def __init__(self, dataset: pd.DataFrame = None, timeCol: str = "date_time",  resamples: int = 1000, maxTrainingSizeMult: int = 10, maxCategory: int = 100, minCategories: int = 10):
        '''
        Constructor; the dataset may be attached immediately or later through any of the load functions

        Parameters
        ----------
        dataset: pandas DataFrame - The dataset reference (optional)
        timeCol: String - The name of the primary TimeStamp column. Default = "date_time".
        resamples: int - the number of times the bootstrap resamples. Making this very large will improve accuracy but significantly lower speed. Default = 1000
        maxTrainingSizeMult: int - If there is more than x  = maxTrainingSizeMult ratio of training to test data, trim training data to most recent. Default = 10
        maxCategory: int - Maximum number of categories in a column (to ensure that counts are not tiny and are meaningful), column skipped if value count higher than this. Default = 100
        minCategories: int - if column has a category count that is lower than this value, don't report it in bootstrap surprise. Default = 10.
        '''
        self.timestamp = timeCol
        self.data = dataset

        # Meta-parameter initialization (stored on the instance so the other methods can read it)
        self.params = {
          "bootstrapResamples": resamples,
          "maxTrainingSizeMultiple": maxTrainingSizeMult, # if there is more than X times more training data, trim to most recent
          "maxCategories": maxCategory,
          "minCategoryCount": minCategories,
        }


    # Converts Timestamp column of DataFrame to a legitimate timestamp
    def convert_time_stamp_to_datetime(self, formatting: str = '%Y%m%d %H:%M:%S') -> pd.DataFrame:
        '''
        Converts the object's timestamp column from string to date/time, making the modification to the fitted
        Data Frame and returning that Data Frame

        Parameters
        ----------
        formatting: String - If formatting different from default = %Y%m%d %H:%M:%S, enter the format of your TimeSeries column

        Returns
        -------
        data : pandas.DataFrame
            Returns the entire DataFrame with the modified Timestamp column
        '''
        self.data[self.timestamp] = pd.to_datetime(self.data[self.timestamp], format = formatting)
        return self.data


    # Splits data into train and test set based on date/time
    def split_train_test_by_time(self, batchHours: int = 24*7):
        '''
        Splits Data into a train and test set, held within the object

        Parameters
        ----------
        batchHours: int - Size of the test set in terms of hours. Default is one week (24 * 7).
        '''
        timestamps = self.data[self.timestamp]
        batchTs = timestamps.max() - timedelta(hours = batchHours)
        self.testDf = self.data[timestamps > batchTs]
        # "<=" so that rows falling exactly on the cut-off are kept (in the training set)
        self.trainDf = self.data[timestamps <= batchTs]


    # Helpers and Math
    def pValue(self, data, threshold: np.number, result: dict) -> float:
        '''
        Returns the two-sided p-value of an observed count against its bootstrap distribution

        Parameters
        ----------
        data: array-like - The bootstrap counts the p-value is computed on
        threshold: np.number - The observed count that is checked for being anomalous
        result: dict - Record holding "bootstrap_counts" and "count", used for the Gaussian fallback

        Returns
        -------
        pGauss : float
            The empirical p-value, or a Gaussian tail estimate when the empirical one is 0
        '''
        counts = np.asarray(data)

        # Taking the smaller of the 2 p-values (either could present large anomaly)
        pLarger = np.count_nonzero(counts >= threshold) / len(counts)
        pSmaller = np.count_nonzero(counts <= threshold) / len(counts)
        p = min(pLarger, pSmaller)

        # only use gaussian p-value when there is variation, but bootstrap p = 0
        stdev = np.std(counts)
        if stdev == 0 or p != 0:
            return p

        # Normalizing
        pGauss = st.norm(np.mean(result['bootstrap_counts']), stdev).cdf(result['count'])
        return min(pGauss, 1 - pGauss)


    def trimFrame(self, df: pd.DataFrame) -> pd.DataFrame:
        '''
        Trims a DataFrame, ensuring that it does not exceed the training set max size hyper parameter

        Parameters
        ----------
        df: pandas DataFrame - The DataFrame that is being trimmed to fit to the training set hyperparameter

        Returns
        -------
        dfTrimmed : pandas DataFrame
            The most recent rows of df, at most maxTrainingSizeMultiple * len(testDf) of them
        '''
        # trim to most recent
        df = df.sort_values(self.timestamp, ascending = False)
        maxRows = self.params['maxTrainingSizeMultiple'] * len(self.testDf)
        return df[:maxRows]


    # Returns names of categorical columns
    @staticmethod
    def getCategoricalColumnNames(df: pd.DataFrame) -> list:
        '''
        Returns the names of categorical columns in a Pandas DataFrame (a column counts as
        categorical when its first value is a string)

        Static so that it works both as self.getCategoricalColumnNames(df) and as
        Bootstrap.getCategoricalColumnNames(df).

        Parameters
        ----------
        df: pandas DataFrame - The DataFrame whose columns are checked for being categorical data

        Returns
        -------
        columnNames : list
            The list of all categorical column names (empty for an empty DataFrame)
        '''
        # No rows means there is no first value to inspect
        if len(df) == 0:
            return []
        return [columnName for columnName in df.keys() if isinstance(df[columnName].iloc[0], str)]


    def train_test_anomaly(self) -> pd.DataFrame:
        '''
        Tests for difference between training and test set counts, returning a report that quantifies difference between
        training and test set as surprise.

        Returns
        -------
        resultsDf : pandas DataFrame
            A DataFrame containing a report for the difference between expected and detected counts within the test set
            With the inclusion of a column quantifying irregularity as surprise (entropy). Empty (but with the
            report columns) when no category qualifies for reporting.

        Raises
        ------
        AssertionError
            If the train or the test set has not been set up
        '''
        # Preventative measures ("is not None": comparing a DataFrame with != is element-wise)
        assert self.trainDf is not None and self.testDf is not None, "Please set up your train and test sets prior to attempting this step"

        # get all of the string columns
        columnNames = self.getCategoricalColumnNames(self.testDf)

        bootstrapDf = self.trimFrame(self.trainDf)

        # set up dict, add counts
        results = {}
        for columnName in columnNames:
            categories = pd.concat([bootstrapDf[columnName], self.testDf[columnName]]).unique()
            if len(categories) > self.params['maxCategories']:
                continue

            testCounts = self.testDf[columnName].value_counts(dropna = False)

            # Missing values are not a category of their own, leave them out
            results[columnName] = {
                category: {'bootstrap_counts': [], 'count': testCounts.get(category, 0)}
                for category in categories
                if not pd.isna(category)
            }

        # resample, add bootstrap counts
        sampleSize = len(self.testDf)
        if results:
            for _ in range(self.params['bootstrapResamples']):

                # Draw random sample from training
                sampleDf = bootstrapDf.sample(sampleSize, replace = True)
                for columnName, categoryResults in results.items():

                    # count by category
                    trainCounts = sampleDf[columnName].value_counts(dropna = False)

                    # put results in dict
                    for category, result in categoryResults.items():
                        result['bootstrap_counts'].append(trainCounts.get(category, 0))

        # convert to records, add p-values
        minCount = self.params['minCategoryCount']
        bootstrap_results = []
        for columnName, categoryResults in results.items():
            for category, result in categoryResults.items():
                estimatedCount = int(np.round(np.mean(result['bootstrap_counts'])))

                # don't report entries with very low predicted and actual counts
                if estimatedCount < minCount and result['count'] < minCount:
                    continue

                p = self.pValue(result['bootstrap_counts'], result['count'], result)

                # Backup name for the empty-string category
                categoryName = category if category else "NULL"

                bootstrap_results.append({"column": columnName,
                                   "category": categoryName,
                                   "count": result['count'],
                                   "p": p,
                                   "estimated_count": estimatedCount,
                                   })

        resultColumns = ["column", "category", "count", "p", "estimated_count"]
        if not bootstrap_results:
            return pd.DataFrame(columns = resultColumns + ["surprise"])

        # Sorting by P-values and obtaining Surprise of each
        resultsDf = pd.DataFrame.from_records(bootstrap_results, columns = resultColumns).sort_values('p')
        # p == 0 legitimately maps to infinite surprise; silence numpy's divide warning for it
        with np.errstate(divide = 'ignore'):
            resultsDf['surprise'] = -np.log2(resultsDf['p'])

        return resultsDf
        

In [6]:
from fbprophet import Prophet

class TimeSeries(Anomaly):
    '''
    Utilizes facebook prophet and its ability to predict the future based off specific time context (day, hour, holiday)
    to make predictions and test those against the dataset, thus finding anomaly with the context of time. Please ensure
    you set your train and test sets prior to computation.
    '''
    # Class-level defaults; the constructor and the pipeline methods overwrite them per instance
    data = None
    trainDf = None
    testDf = None
    prophetDf = None
    countryModel = None
    forecast = None
    timestamp = None

    def __init__(self, timeStampInput = 'date_time', inp_data = None, train = None, test = None):
        '''
        Does not require any inputs, but gives user option to initialize input data/train/test right from the get-go

        Parameters
        ----------
        timeStampInput: String - The name of the primary TimeStamp column. Default = 'date_time'
        inp_data: pandas DataFrame - A given data set
        train: pandas DataFrame - A given training set
        test: pandas DataFrame - A given test set

        Raises
        ------
        AssertionError
            If any of the supplied sets is neither None nor a pandas DataFrame
        '''
        # Ensuring that if user has given us input, it is of the correct form
        assert inp_data is None or isinstance(inp_data, pd.DataFrame), "inputted data is not a pandas DataFrame"
        self.data = inp_data

        # Ensuring training set is of the correct form
        assert train is None or isinstance(train, pd.DataFrame), "inputted data is not a pandas DataFrame"
        self.trainDf = train

        # Ensuring test set is of the correct form
        assert test is None or isinstance(test, pd.DataFrame), "inputted data is not a pandas DataFrame"
        self.testDf = test

        self.timestamp = timeStampInput


    @staticmethod
    def truncateTs(ts: pd.Series) -> pd.Series:
        '''
        Truncates a timestamp column to the hour precision (minute, second, and microsecond all set to 0)

        Static so that it works both as self.truncateTs(ts) and as TimeSeries.truncateTs(ts).

        Parameters
        ----------
        ts: pandas Series - A given Timestamp column to be truncated (a single timestamp is accepted as well)

        Returns
        -------
        ts : pandas Series
            The Timestamp column truncated to the hour precision
        '''
        if isinstance(ts, pd.Series):
            # Series.replace substitutes values; flooring through the .dt accessor is the
            # element-wise way to zero out everything below the hour
            return pd.to_datetime(ts).dt.floor('h')
        # Scalar timestamps support field replacement directly
        return ts.replace(minute=0, second=0, microsecond=0)


    def _hourlyCounts(self, df: pd.DataFrame, truncate: bool = True) -> pd.Series:
        '''Counts the rows of df per value of the timestamp column (floored to the hour when truncate is True), ordered by time'''
        stamps = pd.to_datetime(df[self.timestamp])
        if truncate:
            stamps = self.truncateTs(stamps)
        return stamps.value_counts().sort_index()


    def group_and_build_time_table(self, truncated: bool = False) -> pd.DataFrame:
        '''
        Builds a table on the basis of the value counts (or rather the log 10 of the value counts)
        of the training set's timestamps. The training set itself is not modified.

        Parameters
        ----------
        truncated: boolean - A value representing whether or not the training set has already been truncated to the hour. Default is False, in which case the timestamps are truncated automatically.

        Returns
        -------
        prophetDf : pandas Data Frame
                   A table with one row per timestamp: 'ds' (the timestamp) and 'y' (log 10 of its row count)

        '''
        # Counting rows per timestamp (not per unique full row)
        groupedCounts = self._hourlyCounts(self.trainDf, truncate = not truncated)

        # Grouping counts in a single DataFrame
        self.prophetDf = pd.DataFrame({'ds': groupedCounts.index, 'y': np.log10(groupedCounts.values)})
        return self.prophetDf


    # Fits a Prophet model onto the table built by group_and_build_time_table
    def train_model_on_country(self, country: str = "US"):
        '''
        Trains a Facebook Prophet model with the holidays of a specified country and an interval
        width of one sigma. Default country is the United States. Will fit this country model onto TimeSeries table.

        Common issues: Downloading Prophet can be very messy and certain modifications might need to be made to
        enable holidays such as Easter. You can read more about this issue here:

        https://github.com/facebook/prophet/issues/1293

        Parameters
        ----------
        country: String - The name of a valid country included in the Prophet seasonality package

        Returns
        -------
        self.countryModel: Prophet
                          Facebook Prophet model fitted onto the country of your choice (is also now contained as
                          an instance variable)

        Raises
        ------
        AssertionError
            If country is not a string
        '''
        # Ensuring inputted country is a string
        assert isinstance(country, str), "Given country should be formatted as a string"

        # Train model; seasonality and growth settings are left at Prophet's defaults
        self.countryModel = Prophet(interval_width=0.68)  # one sigma
        self.countryModel.add_country_holidays(country_name=country)

        self.countryModel.fit(self.prophetDf)
        return self.countryModel


    # Splits data into train and test set based on date/time
    def split_train_test_by_time(self, batchHours: int = 24*7):
        '''
        Splits Data into a train and test set, held within the object

        Parameters
        ----------
        batchHours: int - Size of the test set in terms of hours. Default is one week (24 * 7).
        '''
        timestamps = self.data[self.timestamp]
        batchTs = timestamps.max() - timedelta(hours = batchHours)
        self.testDf = self.data[timestamps > batchTs]
        # "<=" so that rows falling exactly on the cut-off are kept (in the training set)
        self.trainDf = self.data[timestamps <= batchTs]


    # Applies Prophet analytics to create a forecast based on hours
    def predict_future(self, timestamp: str = None):
        '''
        Builds (and returns) a future forecast for comparison to test set (which should be further ahead in time relative
        to the training set). Made on the basis of the number of hours which encompass the test set.

        Parameters
        ----------
        timestamp: String - Name of the timestamp column of the test set. Default is the column the object was constructed with.

        Returns
        -------
        self.forecast: Table
                      A forecast representative of the predictions the Prophet Model would assume the test set to be,
                      based on the training set.
        '''
        column = self.timestamp if timestamp is None else timestamp

        # Convert up front so string timestamps work too; the test set itself is left untouched
        testTimes = pd.to_datetime(self.testDf[column])

        # find number of hours to predict: hours spanned by the test set, rounded up
        timeDelta = testTimes.max() - testTimes.min()
        hours = int(timeDelta.total_seconds() // (60*60)) + 1

        future = self.countryModel.make_future_dataframe(periods = hours, freq = 'H')
        self.forecast = self.countryModel.predict(future)
        return self.forecast


    def train_test_anomaly(self) -> pd.DataFrame:
        '''
        Based on the training-set-reliant prediction of the future, calculates the anomaly between the forecast and the
        test set utilizing the metric of surprise.

        Returns
        -------
        prophetResultsDf: Pandas DataFrame
                      A table containing difference between observed and expected counts, sorted in order of the metric of surprise
                      (How anomalous/chaotic/entropic the data is). Hours of the test set that the forecast does not
                      cover are left out; the table is empty when none is covered.
        '''
        # Observed hourly counts of the TEST set are what gets scored against the forecast
        groupedCounts = self._hourlyCounts(self.testDf)

        prophetTestDf = pd.DataFrame({'ds': groupedCounts.index,
                                      'y': np.log10(groupedCounts.values),
                                      'y_linear': groupedCounts.values})

        # One forecast row per timestamp, indexed for direct lookup
        forecastByTs = self.forecast.drop_duplicates('ds').set_index('ds')

        # find p-value
        prophet_results = []

        # Comparing observed and forecast data for identical intervals
        for ts, y, yLinear in zip(prophetTestDf['ds'], prophetTestDf['y'], prophetTestDf['y_linear']):
            if ts not in forecastByTs.index:
                continue
            fcstExample = forecastByTs.loc[ts]
            mean = fcstExample['yhat']
            # The interval is one sigma wide on each side, so half its width is the standard deviation
            stdev = (fcstExample['yhat_upper'] - fcstExample['yhat_lower'])/2

            # Calculating the P-value
            p = st.norm(mean, stdev).cdf(y)
            p = min(p, 1-p)

            prophet_results.append({"column": "Forecast",
                               "category": str(ts),
                               "count": yLinear,
                               "p": p,
                               "estimated_count": int(np.round(np.power(10, mean))),
                               })

        resultColumns = ["column", "category", "count", "p", "estimated_count"]
        if not prophet_results:
            return pd.DataFrame(columns = resultColumns + ["surprise"])

        # Obtaining Entropy of Time-Series values
        prophetResultsDf = pd.DataFrame.from_records(prophet_results, columns = resultColumns).sort_values('p')
        # p == 0 legitimately maps to infinite surprise; silence numpy's divide warning for it
        with np.errstate(divide = 'ignore'):
            prophetResultsDf['surprise'] = -np.log2(prophetResultsDf['p'])
        return prophetResultsDf


    # Takes in a model that has been trained on country, plots graphs for visualization
    def visualize(self):
        '''
        Builds plots for the forecast, displaying its construction on the basis of certain time intervals utilizing the
        country fitted model self.countryModel

        Common issues: this step cannot be completed until you have trained the model on country (train_model_on_country)
        and made a forecast (predict_future). Please complete these prior steps to build the forecast predictions prior to attempting
        to visualize them.
        '''
        # Model visualization
        fig = self.countryModel.plot(self.forecast)
        fig = self.countryModel.plot_components(self.forecast)
        

ERROR:fbprophet:Importing plotly failed. Interactive plots will not work.


In [4]:
class KernelPCA(Anomaly):
    '''
    Joint Kernel Density + PCA process meant to run on all numerical columns to triangulate outliers.

    Only a placeholder constructor is defined in this class body.
    '''
    # Every class-level slot starts out unset
    data = knal = pca = train = test = None

    def __init__(self, x):
        '''
        Placeholder constructor that echoes its argument to standard output.

        Parameters
        ----------
        x: any printable object - The value that is printed
        '''
        print(x)

In [30]:
class Kernel(Anomaly):
    '''Column-based numerical outlier tester that utilizes fitting a Kernel and obtaining a density estimation'''
    data  = None
    train = None
    test = None

    def __init__(self, data_value:pd.DataFrame = None, train_set: pd.DataFrame = None, test_set: pd.DataFrame = None):
        '''
        Object constructor which allows user to set dataset as default and initialize train+test sets.
        Keep in mind the dataset can also be loaded by means of any method in the base Anomaly class.
        You also don't have to initialize your train and test sets right away and can use our random splitter to do so prior to running the Kernel Density itself!

        Parameters
        ----------
        data_value: Pandas DataFrame - The dataset you would like to fit Kernel Density to
        train_set: Pandas DataFrame  - The train set you would like to use for Kernel Density purposes (default = None)
        test_set: Pandas DataFrame  - The test set you would like to use for Kernel Density purposes (default = None)
        '''
        # The base constructor validates the dataset and stores it on self.data
        super().__init__(data_value)
        self.train = train_set
        self.test = test_set

        # Warning beginning user if train-test balance is off.
        # `is not None` is required here: `frame != None` is evaluated element-wise on a DataFrame
        # and its truth value is ambiguous, which raises a ValueError.
        if train_set is not None and test_set is not None and train_set.shape[0] < test_set.shape[0]:
            print('Warning: training set smaller than test set. Could potentially damage results')


    # Shared implementation behind the four public kernel methods
    def _log_density(self, stat: pd.Series, kernel: str) -> np.ndarray:
        '''
        Fits a KernelDensity estimator with the requested kernel to the column and scores the same samples.

        Parameters
        ----------
        stat: Pandas Series - A numerical column without NaN values
        kernel: String - Name of the kernel handed to sklearn's KernelDensity

        Returns
        -------
        density: np.ndarray
                  A 1 dimensional array holding the log density of every sample, in the order of ``stat``
        '''
        # Ensuring that there are no NaN values that may break the Kernel process
        assert not stat.isnull().values.any(), "Please make sure to preprocess your table prior so that Nan's and non-numerical values are removed, you can do so with the Kernel.kernelPrep() method"

        # KernelDensity expects a 2 dimensional (n_samples, n_features) input
        samples = stat.to_numpy().reshape(-1,1)
        estimator = neighbors.KernelDensity(kernel = kernel).fit(samples)
        return estimator.score_samples(samples)


    # Using cosine kernel function to get estimate for log density
    def cosKernel(self, stat: pd.Series) -> np.ndarray:
        '''
        Fits a Cosine Kernel to the data and scores samples by their density.
        Used for distributions with low variability.

        Common issues: argument stat should be a Pandas Series: not a DataFrame or an array. Should contain numerical values only,
        NaN's and non numerical values will break the cosKernel.

        Parameters
        ----------
        stat: Pandas Series - A Pandas Series you would like to fit the cosine Kernel to

        Returns
        -------
        cos_density: np.ndarray
                  A 1 dimensional array with the scored log density of every sample. Can read more here:
                  https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KernelDensity.html
        '''
        return self._log_density(stat, 'cosine')


    # Using gaussian kernel function to get estimate for log density
    def gaussKernel(self,stat: pd.Series) -> np.ndarray:
        '''
        Fits a Gaussian Kernel to the data and scores samples by their density.
        Used for distributions with standard Gaussian variability.

        Common issues: argument stat should be a Pandas Series: not a DataFrame or an array. Should contain numerical values only,
        NaN's and non numerical values will break the gaussKernel.

        Parameters
        ----------
        stat: Pandas Series - A Pandas Series you would like to fit the Gaussian Kernel to

        Returns
        -------
        density: np.ndarray
                  A 1 dimensional array with the scored log density of every sample. Can read more here:
                  https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KernelDensity.html
        '''
        return self._log_density(stat, 'gaussian')


    # Using exponential kernel function to get estimate for log density
    def expKernel(self,stat: pd.Series)-> np.ndarray:
        '''
        Fits an Exponential (fatty tailed) Kernel to the data and scores samples by their density.
        Used for distributions with high variability.

        Common issues: argument stat should be a Pandas Series: not a DataFrame or an array. Should contain numerical values only,
        NaN's and non numerical values will break the expKernel.

        Parameters
        ----------
        stat: Pandas Series - A Pandas Series you would like to fit the exponential Kernel to

        Returns
        -------
        expDensity: np.ndarray
                  A 1 dimensional array with the scored log density of every sample. Can read more here:
                  https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KernelDensity.html
        '''
        return self._log_density(stat, 'exponential')


    # Using epanechnikov kernel function to get estimate for log density
    def parabolicKernel(self,stat: pd.Series) -> np.ndarray:
        '''
        Fits a Parabolic/Epanechnikov Kernel to the data and scores samples by their density.
        Used for distributions with low variability

        Common issues: argument stat should be a Pandas Series: not a DataFrame or an array. Should contain numerical values only,
        NaN's and non numerical values will break the parabolicKernel.

        Parameters
        ----------
        stat: Pandas Series - A Pandas Series you would like to fit the parabolic Kernel to

        Returns
        -------
        epDensity: np.ndarray
                  A 1 dimensional array with the scored log density of every sample. Can read more here:
                  https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KernelDensity.html
        '''
        return self._log_density(stat, 'epanechnikov')


    # Specialized column based P-value function: double ended
    def retPVal(self,col: pd.Series) -> pd.Series:
        '''
        Converts a column to Z-scores and plugs them into the standard normal cdf, returning a Pandas Series
        of tail probabilities min(cdf, 1 - cdf).

        A column whose values are all identical has no spread, so every element is treated as sitting
        exactly on the mean (Z-score 0, P-value 0.5) instead of producing NaN's from a division by zero.

        Parameters
        ----------
        col: Pandas Series - A column to compute P-values on

        Returns
        -------
        P_value: Pandas Series
                A Pandas Series containing the corresponding P-value (by index and in order) to every individual element
                in the input column.
        '''
        # Ensuring only valid values are passed
        assert not col.isnull().values.any(), "retPVal cannot calculate P values on an NaN element, please ensure your input does not contain elements of class np.nan"

        values = col.to_numpy(dtype = float)

        # Population standard deviation (ddof = 0), the same one np.std uses by default
        std = values.std()
        if std == 0:
            return pd.Series(np.full(len(values), 0.5), index = col.index)

        # Since we assume a normal distribution, starting by obtaining the z-score
        z_scores = (values - values.mean())/std

        # Now obtaining the p-values for the whole column at once: the smaller of the two tails
        cdf = st.norm.cdf(z_scores)
        return pd.Series(np.minimum(cdf, 1 - cdf), index = col.index)


    # Drops non-numerical and nan values from a table
    def kernelPrep(self)-> pd.DataFrame:
        '''
        Removes non-numerical columns and every column containing a NaN from the dataset for processing purposes.
        The dataset stored on the object is left untouched.

        Returns
        -------
        tabl: pandas DataFrame
            A DataFrame containing the processed data
        '''
        # Finding all numerical components of the table
        tabl = self.data.select_dtypes(include = [np.number])

        # axis is passed by keyword: recent pandas versions no longer accept it positionally
        tabl = tabl.dropna(axis = 1)
        return tabl


    # Fits proper Kernel and returns the surprise by element
    def surprise_estimator(self,stat: pd.Series) -> pd.Series:
        '''
        Returns the surprise on a per-value basis of the Kernel density estimation for a particular column.

        Common issues: Series should have a set index, one that generalizes to the rest of the dataset. Not having
        such an index will cause trouble when grouping all columns into a single DataFrame.

        Parameters
        ----------
        stat: Pandas Series - A Series of numerical values with a set index.

        Returns
        -------
        Surprise: Pandas Series
            A Series containing each value index and its corresponding surprise (-log2 of the P-value
            of its log density), sorted by ascending surprise.

        '''
        # The kernels return a bare array, retPVal needs a Series: re-attach the column's index
        densities = pd.Series(self.kernel_decider(stat), index = stat.index)

        # Surprise Metric
        p_vals = self.retPVal(densities)
        surprise = -np.log2(p_vals)
        return surprise.sort_values()


    def kernel_decider(self,stat: pd.Series) -> np.ndarray:
        '''
        Returns Kernel density estimations for a particular column, decided based off the column's variability.

        The furthest value from the mean, measured in sample standard deviations on either side, picks the kernel:
        more than 3.2 deviations uses the exponential kernel, between 2 and 3.2 the Gaussian kernel and
        anything below (including a column without any spread) the parabolic kernel.

        Parameters
        ----------
        stat: Pandas Series - A Series of numerical values.

        Returns
        -------
        metric: np.ndarray
            A 1 dimensional array with the log density of every value, in the order of ``stat``.
        '''
        mean = stat.mean()
        dev = stat.std()

        # No measurable spread (constant column or a single value): nothing to divide by
        if not dev > 0:
            return self.parabolicKernel(stat)

        # Calculating maximum number of deviations from the mean so as to choose proper Kernel model.
        # Absolute distances are used so that an extreme low value counts as much as an extreme high one.
        numDev = max(abs(stat.max() - mean), abs(stat.min() - mean))/dev

        # Assigning appropriate Kernel Estimator on the basis of model's variability
        if numDev > 3.2:
            return self.expKernel(stat)
        if numDev >= 2:
            return self.gaussKernel(stat)
        return self.parabolicKernel(stat)


    # A grouping of the entire kernel estimation process
    def surprise_on_Table(self, index: str = None) -> pd.DataFrame:
        '''
        Returns Surprise values for a whole table based off Kernel Density estimations (per column).

        Every numerical column without NaN's is replaced by its per-value surprise and a ``mean_surprise``
        column holding the row average (rounded to 2 decimals) is added. All other columns are carried over
        unchanged. The dataset stored on the object is not modified, so the method can be called repeatedly.

        Parameters
        ----------
        index: String - An optional column to use as the index of the returned table: otherwise the current index is kept (default = None).
                        A column used as the index is not scored.

        Returns
        -------
        result: Pandas DataFrame
            The table of surprise values, sorted by descending ``mean_surprise``.

        Raises
        ------
        KeyError - if ``index`` is not a column of the dataset
        ValueError - if the dataset has no numerical column without NaN's to score
        '''

        #Preprocessing data
        numeric = self.kernelPrep()

        # Checking if index given, if it isn't will just use Table's current default index
        if index is not None:
            result = self.data.set_index(index)
            numeric = numeric.drop(columns = [index], errors = "ignore")
        else:
            result = self.data.copy()

        if numeric.shape[1] == 0:
            raise ValueError("surprise_on_Table needs at least one numerical column without NaN values")

        #Obtaining surprise of every individual column
        sum_surprise = np.zeros(result.shape[0])
        for col in numeric.columns:
            # Scoring on a positional index and restoring row order afterwards keeps rows lined up
            # even when the table's own index holds duplicate labels
            stat = numeric[col].reset_index(drop = True)
            col_surprise = self.surprise_estimator(stat).sort_index().to_numpy()
            result[col] = col_surprise
            sum_surprise += col_surprise

        # Averaging our surprise so we can sort by it
        result = result.assign(mean_surprise = np.round(sum_surprise/numeric.shape[1],2))

        # Sorting table for easier visualization
        return result.sort_values(by = "mean_surprise", ascending  = False)


0     0.031633
1     0.113252
2     0.465579
3     0.465579
4     0.231387
5     0.003703
6     0.113252
7     0.465579
8     0.287223
9     0.465579
10    0.231387
11    0.465579
12    0.465579
13    0.465579
14    0.465579
dtype: float64

In [6]:
class PCA(Anomaly):
    '''Row-based outlier techniques that utilizes dimensionality reduction to understand systematic bias by row'''

    def __init__(self,x):
        # Placeholder constructor: only echoes its argument
        print(x)


    @staticmethod
    def obtain_variance_table(first_table):
        '''
        Builds a table with the share of variance explained by every principal component.

        Parameters
        ----------
        first_table: Pandas DataFrame - The table to run the decomposition on

        Returns
        -------
        infoFrame: Pandas DataFrame
            One row per component with the columns "Column" (PC0, PC1, ...) and "Variance_ratio"
        '''
        # Scaling and preparing values for PCA
        # NOTE(review): pcaPrep is not defined in this class; presumably a module-level helper that
        # keeps the numerical, NaN-free columns - confirm it exists before running
        tabl = pcaPrep(first_table)
        scaled_data = StandardScaler().fit_transform(tabl)

        # sklearn's PCA is reached through the `decomposition` module: once this class is defined
        # the bare name `PCA` refers to the class itself, not to the sklearn estimator
        pca = decomposition.PCA(n_components = (tabl.shape[1]))
        pca.fit(scaled_data)
        infoFrame = pd.DataFrame({
            "Column": ["PC" + str(i) for i in range(tabl.shape[1])],
            "Variance_ratio": pca.explained_variance_ratio_,
        })
        return infoFrame


    @staticmethod
    def obtainPCAVals(componentNum, scaled_data):
        '''
        Projects the scaled data onto its first ``componentNum`` principal components.

        Parameters
        ----------
        componentNum: int - Number of principal components to keep
        scaled_data: array-like - The already scaled data to decompose

        Returns
        -------
        pcaData: np.ndarray
            The transformed data, one column per kept component
        '''
        pca = decomposition.PCA(n_components = componentNum)
        pcaData = pca.fit_transform(scaled_data)
        return pcaData


    # Deciding how many columns need to be used: utilizing threshold of 95% of the explained variance
    @staticmethod
    def elementDecider(infoFrame, threshold = .95):
        '''
        Counts how many leading components are needed before the accumulated variance ratio reaches the threshold.

        Parameters
        ----------
        infoFrame: Pandas DataFrame - A table with a "Variance_ratio" column, as built by obtain_variance_table
        threshold: float - Share of the variance that has to be accounted for (default = .95)

        Returns
        -------
        counter: int
            The number of components taken
        '''
        numSum = 0
        counter = 0

        # Continuing until we have accounted for the requested share of the variance
        for ratio in infoFrame.get("Variance_ratio"):
            if numSum >= threshold:
                break
            numSum += ratio
            counter += 1
        return counter


    # Reducing dimensionality of data into pc's, only storing what is necessary
    @classmethod
    def reducedData(cls, infoFrame,  scaled_data, indx):
        '''
        Reduces the scaled data to the components selected by elementDecider and labels the rows with ``indx``.

        Parameters
        ----------
        infoFrame: Pandas DataFrame - The variance table built by obtain_variance_table
        scaled_data: array-like - The already scaled data to decompose
        indx: The index to set on the resulting table (handed to DataFrame.set_index)

        Returns
        -------
        pcaFrame: Pandas DataFrame
            The principal component values, one column per kept component
        '''
        numCols = cls.elementDecider(infoFrame)
        pcaData = cls.obtainPCAVals(numCols, scaled_data)
        pcaFrame = pd.DataFrame(pcaData)

        # Dealing with potential index issues
        pcaFrame = pcaFrame.set_index(indx)
        return pcaFrame


    # Visualization tool for seeing grouping of elements by pc
    @staticmethod
    def displayReducedData(pcaVals, xNum = 0, yNum = 1):
        '''
        Draws a scatter plot of two principal components against each other.

        Parameters
        ----------
        pcaVals: Pandas DataFrame - The principal component values, as built by reducedData
        xNum: int - The component to put on the x axis (default = 0)
        yNum: int - The component to put on the y axis (default = 1)
        '''
        # Ensuring that the elements given do not overaccess table
        if xNum < pcaVals.shape[1] and yNum < pcaVals.shape[1]:
            # Plotting the components that were asked for
            pcaVals.plot(kind = "scatter", x = xNum, y = yNum)
        else:
            print("You have overaccessed the number of elements, keep in mind there are only " + str(pcaVals.shape[1]) + " elements")


    # Master method to run PCA as a whole
    @classmethod
    def runPCA(cls, table, index):
        '''
        Runs the whole PCA process: preparing and scaling the table, reducing it to the components
        selected by elementDecider and handing the result to pcaRowOutliers.

        Parameters
        ----------
        table: Pandas DataFrame - The table to analyse
        index: String - The column of ``table`` whose values label the rows of the reduced data

        Returns
        -------
        new_pca:
            Whatever pcaRowOutliers returns for the reduced data
        '''
        # NOTE(review): pcaPrep and pcaRowOutliers are not defined in this class; presumably
        # module-level helpers defined elsewhere in the file - confirm
        processing_table = pcaPrep(table)
        variance_table = cls.obtain_variance_table(table)
        scaled_data = StandardScaler().fit_transform(processing_table)
        pcaVals = cls.reducedData(variance_table, scaled_data, table.get(index))
        new_pca = pcaRowOutliers(pcaVals)
        return new_pca
            

In [7]:
class Categorical(Anomaly):
    '''
    Uses dynamically built data "grammar conventions" to find outliers based on defiance of strict structures.

    Currently a placeholder: the constructor only prints the argument it receives.
    '''

    def __init__(self, x):
        # Placeholder constructor: echoes its argument and stores nothing
        print(x)

In [12]:
class MultiDimCategorical(Anomaly):
    '''
    Utilizes the idea of mutual entropy to build first order and 2nd order approximations for a
    given column based on randomly chosen/handpicked context.

    Currently a placeholder: the constructor only prints the argument it receives.
    '''

    def __init__(self, x):
        # Placeholder constructor: echoes its argument and stores nothing
        print(x)

In [13]:
class Report(RunInitial):
    '''
    An extension of the RunInitial class that offers a more verbose and visual report for an initial anomaly scan.

    Currently a skeleton: none of the methods below is implemented yet.
    '''
    data = None
    hyperparams = None

    def __init__(self,x):
        '''Placeholder constructor: ``x`` is accepted but not used yet.'''
        # An explicit body is required here; without one the class does not parse
        pass

    def metadata(self):
        '''Not implemented yet.'''
        pass

    def design_report(self):
        '''Not implemented yet.'''
        pass

    def report_to_excel(self):
        '''Not implemented yet.'''
        pass

    def visualize_pca(self):
        '''Not implemented yet.'''
        pass

    def visualize_kernel_density(self):
        '''Not implemented yet.'''
        pass

    def visualize_decision_tree(self):
        '''Not implemented yet.'''
        pass
        