In [1]:
# Imports for project purposes
# Full Project imports
import pandas as pd
import math as mt
import dateutil
from datetime import datetime, timedelta
import requests as rd
import numpy as np
from sklearn import neighbors, decomposition
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import smtplib
import scipy.stats as st
import os
from datetime import datetime, timedelta
from pandas.api.types import is_numeric_dtype
import copy
from sklearn.model_selection import train_test_split

In [2]:
class Anomaly:
    '''Base Class for an anomaly detection method'''
    # Full dataset plus the train/test partitions derived from it.
    # Class-level defaults so the attributes exist before any load/split call.
    data = None
    trainDf = None
    testDf = None
    
    # Constructor to set values for data
    def __init__(self, input_data = None):
        '''
        Constructor for setting dataset reference to a specific dataset
        
        Parameters
        ----------
        input_data: Pandas DataFrame reference - Your dataset in the form of a Pandas DataFrame
            (or None to attach a dataset later through one of the load_* methods)
        
        Raises
        ------
        AssertionError
            If input_data is neither None nor a pandas DataFrame
        '''
        # Explicit raise (instead of `assert`) so the check survives `python -O`;
        # AssertionError is kept so existing callers that catch it keep working.
        if input_data is not None and not isinstance(input_data, pd.DataFrame):
            raise AssertionError("inputted data is not a pandas DataFrame")
        self.data = input_data
        
        
    # Loading data into project
    def load_html(self, link: str, table_index: int = 0) -> pd.DataFrame:
        '''
        Loads an HTML table and sets it as the dataset for the model.
        
        Parameters
        ----------
        link: String - The link to the dataset that is being loaded
        table_index: int - pandas.read_html returns one DataFrame per table found
            in the document; this selects which one becomes the dataset. Default = 0 (first table).
        
        
        Returns
        -------
        data : pandas.DataFrame
            Returns the entire DataFrame that has just been loaded as the dataset for the model
        
        Raises
        ------
        IndexError
            If table_index does not refer to a table found in the document
        '''
        # read_html yields a *list* of frames; keep a single DataFrame so that
        # self.data has the same type as with every other loader.
        tables = pd.read_html(link)
        self.data = tables[table_index]
        return self.data
    
    
    # Loading data into project
    def load_csv(self, link: str) -> pd.DataFrame:
        '''
        Loads a CSV table and sets it as the dataset for the model. 
        
        Parameters
        ----------
        link: String - The link to the dataset that is being loaded
        
        
        Returns
        -------
        data : pandas.DataFrame
            Returns the entire DataFrame that has just been loaded as the dataset for the model
        '''
        self.data = pd.read_csv(link)
        return self.data
    
    
    # Loading data into project
    def load_excel(self, link: str) -> pd.DataFrame:
        '''
        Loads an Excel table and sets it as the dataset for the model.
        
        Parameters
        ----------
        link: String - The link to the dataset that is being loaded
        
        
        Returns
        -------
        data : pandas.DataFrame
            Returns the entire DataFrame that has just been loaded as the dataset for the model
        '''
        self.data = pd.read_excel(link)
        return self.data
    
    
    # Loading data into project
    def load_sql_table(self, link: str, con = None, **read_kwargs) -> pd.DataFrame:
        '''
        Loads a SQL table and sets it as the dataset for the model.
        
        Parameters
        ----------
        link: String - Passed to pandas.read_sql_table as its first argument (the table name)
        con: The database connection handed to pandas.read_sql_table. Required by pandas.
        read_kwargs: Any further keyword arguments are forwarded to pandas.read_sql_table
        
        
        Returns
        -------
        data : pandas.DataFrame
            Returns the entire DataFrame that has just been loaded as the dataset for the model
        
        Raises
        ------
        TypeError
            If no connection is supplied
        '''
        # pandas.read_sql_table cannot run without a connection; fail with a
        # message that says so instead of pandas' "missing argument" error.
        if con is None:
            raise TypeError("load_sql_table requires a database connection: pass con=...")
        self.data = pd.read_sql_table(link, con, **read_kwargs)
        return self.data
    
    
    # Setter for the training set
    def set_train(self, trainingSet: pd.DataFrame):
        '''
        A setter for the training set.
        
        Parameters
        ----------
        trainingSet: pandas DataFrame - A DataFrame object that will serve as your training set
        '''
        self.trainDf = trainingSet
    
    
    # Setter for the test set
    def set_test(self, testSet: pd.DataFrame):
        '''
        A setter for the test set.
        
        Parameters
        ----------
        testSet: pandas DataFrame - A DataFrame object that will serve as your test set
        '''
        self.testDf = testSet
        
    
    # Randomly split train and test set
    def assign_train_test(self,random_state = 42, training_set_ratio = 0.8, shuffling = True):
        '''
        A default random splitter into train and test set
        
        Parameters
        ----------
        random_state : int - determines random state fed to the splitter for reproducibility of random results, default is 42
        training_set_ratio: float between 0.0 and 1.0 - what % of your data you would like to encompass the training set (test set will be made in complementary way) default is 0.8
        shuffling: boolean - whether or not you would like your data randomly shuffled out of chronology prior to split (True/False). Default is True.
        
        Returns
        -------
        (trainDf, testDf) : tuple of pandas.DataFrame
            The two partitions, which are also stored on the object
        
        Raises
        ------
        AssertionError
            If no dataset has been attached yet
        '''
        # Ensuring that data actually exists before splitting
        if self.data is None:
            raise AssertionError("You cannot assign a train and test set out of a dataset that has not been initialized")
        
        # Splitting into train and test; random_state is forwarded so the
        # shuffle is reproducible, as the parameter promises.
        self.trainDf, self.testDf = train_test_split(
            self.data,
            train_size = training_set_ratio,
            shuffle = shuffling,
            random_state = random_state,
        )
        return self.trainDf, self.testDf
    

In [5]:
class Bootstrap(Anomaly):
    '''A class for returning anomaly of categorical column counts, utilizing the metric of surprise (entropy)'''
    data = None
    trainDf = None
    testDf = None
    # Name of the timestamp column and the dict of meta-parameters; both are
    # filled in per instance by the constructor.
    timestamp = None
    params = None
    
    # Single constructor: the dataset is optional so the object can be built
    # first and populated later through one of the load_* methods.
    def __init__(self, dataset = None, timeCol = "date_time",  resamples = 1000, maxTrainingSizeMult = 10, maxCategory = 100, minCategories = 10):
        '''
        Constructor. Python keeps only the last definition of a method, so the
        "with dataset" and "without dataset" variants are merged into one
        signature with an optional dataset.

        Parameters
        ----------
        dataset: pandas DataFrame - The dataset to attach immediately, or None to attach one later. Default = None.
        timeCol: String - The name of the primary TimeStamp column. Default = "date_time".
        resamples: int - the number of times the bootstrap resamples. Making this very large will improve accuracy but significantly lower speed. Default = 1000
        maxTrainingSizeMult: int - If there is more than x  = maxTrainingSizeMult ratio of training to test data, trim training data to most recent. Default = 10
        maxCategory: int - Maximum number of categories in a column (to ensure that counts are not tiny and are meaningful), column skipped if value count higher than this. Default = 100
        minCategories: int - if column has a category count that is lower than this value, don't report it in bootstrap surprise. Default = 10.
        '''
        # The base constructor validates and stores the dataset on the instance
        super().__init__(dataset)
        
        # Initializing time
        self.timestamp = timeCol
        
        # Meta-parameter initialization
        self.params = {
          "bootstrapResamples": resamples,
          "maxTrainingSizeMultiple": maxTrainingSizeMult, # if there is more than X times more training data, trim to most recent
          "maxCategories": maxCategory,
          "minCategoryCount": minCategories,
        }
        
    
    # Converts Timestamp column of DataFrame to a legitimate timestamp
    def convert_time_stamp_to_datetime(self, formatting = '%Y%m%d %H:%M:%S', timestamp = None) -> pd.DataFrame:
        '''
        Converts a chosen timestamp column from string to date/time, making the modifications both to the fitted
        Data Frame and returning the new Data Frame
        
        Parameters
        ----------
        formatting: String - If formatting different from default = %Y%m%d %H:%M:%S, enter the format of your TimeSeries column
        timestamp: String - The name of the Timestamp column that needs conversion. Default = None, meaning
            the column name given to the constructor.
        
        Returns
        -------
        data : pandas.DataFrame
            Returns the entire DataFrame with the modified Timestamp column
        '''
        column = self.timestamp if timestamp is None else timestamp
        self.data[column] = pd.to_datetime(self.data[column], format = formatting)
        return self.data
    
    
    # Splits data into train and test set based on date/time
    def split_train_test_by_time(self, batchHours = 24*7):
        '''
        Splits Data into a train and test set, held within the object
        
        Parameters
        ----------
        batchHours: int - Size of the test set in terms of hours. Default is one week (24 * 7).
        
        Returns
        -------
        (trainDf, testDf) : tuple of pandas.DataFrame
            The two partitions, which are also stored on the object
        '''
        stamps = self.data[self.timestamp]
        batchTs = stamps.max() - timedelta(hours = batchHours)
        self.testDf = self.data[stamps > batchTs]
        # `<=` so rows stamped exactly at the cutoff land in the training set
        # instead of being dropped from both partitions.
        self.trainDf = self.data[stamps <= batchTs]
        return self.trainDf, self.testDf
        
 
    # Helpers and Math
    def pValue(self, data, threshold: np.number, result) -> float:
        '''
        Returns the p-value of an observed count against its bootstrap counts
        
        Parameters
        ----------
        data: list-like of numbers - The bootstrap counts the p-value is computed on
        threshold: np.number - The observed count that is checked for being anomalous
        result: mapping with the keys 'bootstrap_counts' and 'count' - used for the Gaussian fallback
        
        Returns
        -------
        pGauss : float
            The smaller of the two one-sided empirical p-values; when that is 0 but the
            bootstrap counts do vary, a Gaussian tail probability is returned instead
        
        Raises
        ------
        ValueError
            If data is empty
        '''
        counts = np.asarray(data)
        if counts.size == 0:
            raise ValueError("pValue needs at least one bootstrap count")
        
        # Taking the smaller of the 2 p-values (either could present large anomaly)
        pLarger = np.mean(counts >= threshold)
        pSmaller = np.mean(counts <= threshold)
        p = min(pLarger, pSmaller)

        # only use gaussian p-value when there is variation, but bootstrap p = 0
        stdev = np.std(counts)
        if stdev == 0 or p != 0:
            return p
        
        # Normalizing
        pGauss = st.norm(np.mean(result['bootstrap_counts']), stdev).cdf(result['count'])
        return min(pGauss, 1 - pGauss)

    
    def trimFrame(self, df: pd.DataFrame) -> pd.DataFrame:
        '''
        Trims a DataFrame, ensuring that it does not exceed the training set max size hyper parameter
        
        Parameters
        ----------
        df: pandas DataFrame - The DataFrame that is being trimmed to fit to the training set hyperparameter
        
        Returns
        -------
        dfTrimmed : pandas DataFrame
            The most recent rows of df, at most maxTrainingSizeMultiple * len(self.testDf) of them
        '''
        maxRows = self.params['maxTrainingSizeMultiple'] * len(self.testDf)
        # trim to most recent
        return df.sort_values(self.timestamp, ascending = False).iloc[:maxRows]
    
    
    # Returns names of categorical columns
    @staticmethod
    def getCategoricalColumnNames(df: pd.DataFrame) -> list:
        '''
        Returns the names of categorical columns in a Pandas DataFrame (if the type is a string)
        
        Parameters
        ----------
        df: pandas DataFrame - The DataFrame whose columns are checked for being categorical data
        
        Returns
        -------
        columnNames : list
            The list of all categorical column names 
        '''
        columnNames = []
        for columnName in df.columns:
            # Judge the column by its first non-null value so that an empty
            # frame or a leading NaN neither crashes nor hides a string column.
            nonNull = df[columnName].dropna()
            if len(nonNull) > 0 and isinstance(nonNull.iloc[0], str):
                columnNames.append(columnName)
        return columnNames
    
    
    def train_test_anomaly(self, random_state = None) -> pd.DataFrame:
        '''
        Tests for difference between training and test set counts, returning a report that quantifies difference between
        training and test set as surprise.
        
        Parameters
        ----------
        random_state: int or None - seed for the bootstrap resampling; None (default) gives a different draw on every call
        
        Returns
        -------
        resultsDf : pandas DataFrame
            A DataFrame containing a report for the difference between expected and detected counts within the test set 
            With the inclusion of a column quantifying irregularity as surprise (entropy). Sorted by p-value;
            empty (but with the same columns) when nothing qualifies for the report.
        
        Raises
        ------
        ValueError
            If the train/test sets have not been assigned, or fewer than one resample is configured
        '''
        reportColumns = ["column", "category", "count", "p", "estimated_count", "surprise"]
        
        if self.trainDf is None or self.testDf is None:
            raise ValueError("Train and test sets must be assigned before running train_test_anomaly")
        resamples = self.params['bootstrapResamples']
        if resamples < 1:
            raise ValueError("bootstrapResamples must be at least 1")
        
        # get all of the string columns
        columnNames = self.getCategoricalColumnNames(self.testDf)

        bootstrapDf = self.trimFrame(self.trainDf)

        # set up dict, add counts
        results = {}
        for columnName in columnNames:
            # a column that only exists in the test set cannot be bootstrapped
            if columnName not in bootstrapDf.columns:
                continue
            
            # pd.concat replaces Series.append, which was removed in pandas 2.0
            combined = pd.concat([bootstrapDf[columnName], self.testDf[columnName]])
            categories = combined.unique()
            if len(categories) > self.params['maxCategories']:
                continue

            testCounts = self.testDf[columnName].value_counts(dropna = False)
            
            # Missing values are not reported as a category
            results[columnName] = {
                category: {'bootstrap_counts': [], 'count': testCounts.get(category, 0)}
                for category in categories
                if not pd.isna(category)
            }
        
        if not results:
            return pd.DataFrame(columns = reportColumns)
        
        # resample, add bootstrap counts
        # One shared generator: passing the same int to every sample() call
        # would produce the identical sample each time.
        rng = np.random.RandomState(random_state)
        sampleSize = len(self.testDf)
        for _ in range(resamples):

            # Draw random sample from training
            sampleDf = bootstrapDf.sample(sampleSize, replace = True, random_state = rng)
            for columnName, categoryResults in results.items():

                # count by category
                trainCounts = sampleDf[columnName].value_counts(dropna = False)

                # put results in dict
                for category, result in categoryResults.items():
                    result['bootstrap_counts'].append(trainCounts.get(category, 0))

        # convert to records, add p-values
        minCount = self.params['minCategoryCount']
        bootstrap_results = []
        for columnName, categoryResults in results.items():
            for category, result in categoryResults.items():
                estimatedCount = int(np.round(np.mean(result['bootstrap_counts'])))

                # don't report entries with very low predicted and actual counts
                if estimatedCount < minCount and result['count'] < minCount:
                    continue

                p = self.pValue(result['bootstrap_counts'], result['count'], result)

                bootstrap_results.append({"column": columnName,
                                   # Backup label for falsy categories such as ""
                                   "category": category if category else "NULL",
                                   "count": result['count'],
                                   "p": p,
                                   "estimated_count": estimatedCount,
                                   })

        if not bootstrap_results:
            return pd.DataFrame(columns = reportColumns)
        
        # Sorting by P-values and obtaining Surprise of each
        resultsDf = pd.DataFrame.from_records(bootstrap_results).sort_values('p')
        # p == 0 maps to infinite surprise; silence numpy's divide warning for it
        with np.errstate(divide = 'ignore'):
            resultsDf['surprise'] = -np.log2(resultsDf['p'])

        return resultsDf
        

In [3]:
class TimeSeries(Anomaly):
    '''
    Utilizes facebook prophet and its ability to predict the future based off specific time context (day, hour, holiday)
    to make predictions and test those against the dataset, thus finding anomaly with the context of time
    '''
    data = None
    trainDf = None
    testDf = None
    
    def __init__(self,inp_data = None):
        '''
        Constructor for attaching a dataset
        
        Parameters
        ----------
        inp_data: pandas DataFrame - the dataset, or None to attach one later. Default = None.
        '''
        # The base constructor validates and stores the dataset on the instance.
        # Prophet is imported where it is used (train_model_on_country): an
        # import inside __init__ is local to __init__ and invisible elsewhere.
        super().__init__(inp_data)
            
    @staticmethod
    def truncateTs(ts):
        '''Returns ts with minute, second and microsecond set to 0 (i.e. truncated to the hour)'''
        return ts.replace(minute=0, second=0,  microsecond=0)
    
    @staticmethod
    def group_and_build_time_table(truncatedData):
        '''
        Counts how often each value of truncatedData occurs and builds the frame Prophet is fitted on
        
        Parameters
        ----------
        truncatedData: pandas Series - the values to count (the `ds` column is built from its value_counts index)
        
        Returns
        -------
        prophetDf : pandas DataFrame
            Columns `ds` (the distinct values) and `y` (log10 of their counts)
        '''
        groupedCounts = truncatedData.value_counts()
        prophetDf = pd.DataFrame({'ds':groupedCounts.index,'y':np.log10(groupedCounts.values)})
        return prophetDf
    
    # Takes in the dataset and the prophet dataset returned by group_and_build_time_table
    @staticmethod
    def train_model_on_country(testDf, prophetDf, country = "US"):
        '''
        Fits a Prophet model with country holidays on prophetDf
        
        Parameters
        ----------
        testDf: not used by the fit; kept so existing callers keep working
        prophetDf: pandas DataFrame - the `ds`/`y` frame the model is fitted on
        country: String - country name handed to Prophet's add_country_holidays. Default = "US".
        
        Returns
        -------
        m : the fitted Prophet model
        '''
        # NOTE(review): newer Prophet releases may be published under the package
        # name `prophet` rather than `fbprophet` - confirm which one is installed.
        from fbprophet import Prophet
        
        # Train model
        m = Prophet(interval_width=0.68) # one sigma
        m.add_country_holidays(country_name=country)

        m.fit(prophetDf)
        return m
    
    # Splits data into train and test set based on date/time
    def split_train_test_by_time(self, batchHours = 24*7, timestamp = "date_time"):
        '''
        Splits Data into a train and test set, held within the object
        
        Parameters
        ----------
        batchHours: int - Size of the test set in terms of hours. Default is one week (24 * 7).
        timestamp: String - name of the date/time column to split on. Default = "date_time".
        
        Returns
        -------
        (trainDf, testDf) : tuple of pandas.DataFrame
            The two partitions, which are also stored on the object
        '''
        stamps = self.data[timestamp]
        batchTs = stamps.max() - timedelta(hours = batchHours)
        self.testDf = self.data[stamps > batchTs]
        # `<=` so rows stamped exactly at the cutoff are not lost from both sets
        self.trainDf = self.data[stamps <= batchTs]
        return self.trainDf, self.testDf
        

    # Applies Prophet analytics to create a forecast based on hours
    @staticmethod
    def predict_future(testDf,m, timestamp = "date_time"):
        '''
        Uses a fitted model to forecast as many hours as the test set spans
        
        Parameters
        ----------
        testDf: pandas DataFrame - the test set
        m: fitted Prophet model
        timestamp: String - name of the date/time column of testDf. Default = "date_time".
        
        Returns
        -------
        (forecast, testDf) : tuple
            The model's forecast and a copy of testDf with an added date/time column `ts`
        '''
        testDf = testDf.assign(ts = testDf[timestamp])

        # If the column holds strings, convert to date/time. Only `ts` is
        # inspected; there is no need to map `type` over the whole frame.
        if testDf['ts'].map(type).eq(str).any():
            testDf['ts'] = pd.to_datetime(testDf['ts'])

        # number of hours to predict: whole hours spanned by testDf, plus one
        timeDelta = testDf['ts'].max() - testDf['ts'].min()
        hours = int(timeDelta.total_seconds() // 3600) + 1
        future = m.make_future_dataframe(periods = hours, freq = 'H')
        forecast = m.predict(future)
        return forecast, testDf

    @staticmethod
    def find_surprise(truncatedData, forecast):
        '''
        Compares observed hourly counts against a forecast and quantifies the difference as surprise
        
        Parameters
        ----------
        truncatedData: pandas Series - the observed values to count
        forecast: pandas DataFrame - needs the columns `ds`, `yhat`, `yhat_lower`, `yhat_upper`
        
        Returns
        -------
        prophetResultsDf : pandas DataFrame
            One row per observed value that has a forecast, sorted by p-value, with a `surprise` column
            (-log2 of p). Empty (but with the same columns) when nothing could be compared.
        '''
        reportColumns = ["column", "category", "count", "p", "estimated_count", "surprise"]
        groupedCounts = truncatedData.value_counts()

        prophetTestDf = pd.DataFrame({'ds':groupedCounts.index,
                                      'y':np.log10(groupedCounts.values),
                                      'y_linear':groupedCounts.values})

        # find p-value
        prophet_results = []

        # Comparing test and training set data for identical intervals
        for row in prophetTestDf.itertuples(index = False):
            fcstExample = forecast[forecast['ds'] == row.ds]
            # No forecast for this interval: nothing to compare against
            if fcstExample.empty:
                continue
            mean = fcstExample['yhat'].iloc[0]
            # half the width of the prediction interval serves as the standard deviation
            stdev = (fcstExample['yhat_upper'].iloc[0] - fcstExample['yhat_lower'].iloc[0])/2
            
            # Calculating the P-value
            p = st.norm(mean, stdev).cdf(row.y)
            p = min(p,1-p)

            prophet_results.append({"column":"Forecast",
                               "category":str(row.ds),
                               "count":row.y_linear,
                               "p": p,
                               # counts were modelled as log10, so undo that here
                               "estimated_count":int(np.round(np.power(10,mean))),
                               })
        
        if not prophet_results:
            return pd.DataFrame(columns = reportColumns)
        
        # Obtaining Entropy of Time-Series values
        prophetResultsDf = pd.DataFrame.from_records(prophet_results).sort_values('p')
        # p == 0 maps to infinite surprise; silence numpy's divide warning for it
        with np.errstate(divide = 'ignore'):
            prophetResultsDf['surprise'] = -np.log2(prophetResultsDf['p'])
        return prophetResultsDf

    # Takes in a model that has been trained on country, plots graphs for visualization
    @staticmethod
    def visualize(m, forecast):
        '''
        Plots the forecast and its components
        
        Returns
        -------
        (forecastFig, componentsFig) : tuple
            Whatever m.plot and m.plot_components returned
        '''
        # Model visualization
        forecastFig = m.plot(forecast)
        componentsFig = m.plot_components(forecast)
        return forecastFig, componentsFig
    

    
        

In [4]:
class KernelPCA(Anomaly):
    '''
    Combines Kernel Density and PCA into a joint process that runs on all
    numerical columns to triangulate outliers
    '''

    def __init__(self, x):
        '''Placeholder constructor: only prints x (the base constructor is not called)'''
        print(x)

In [5]:
class Kernel(Anomaly):
    '''
    Column-based numerical outlier tester that utilizes fitting a Kernel and
    obtaining a density estimation
    '''

    def __init__(self, x):
        '''Placeholder constructor: only prints x (the base constructor is not called)'''
        print(x)

In [6]:
# NOTE(review): this class rebinds the module-level name `PCA`, which the
# file's import section also binds to sklearn.decomposition.PCA.
class PCA(Anomaly):
    '''
    Row-based outlier technique that utilizes dimensionality reduction to
    understand systematic bias by row
    '''

    def __init__(self, x):
        '''Placeholder constructor: only prints x (the base constructor is not called)'''
        print(x)

In [7]:
class Categorical(Anomaly):
    '''
    Uses dynamically built data "grammar conventions" to find outliers based
    on defiance of strict structures
    '''

    def __init__(self, x):
        '''Placeholder constructor: only prints x (the base constructor is not called)'''
        print(x)

In [12]:
class MultiDimCategorical(Anomaly):
    '''
    Utilizes the idea of mutual entropy to build first order and 2nd order
    approximations for a given column based on randomly chosen/handpicked context
    '''

    def __init__(self, x):
        '''Placeholder constructor: only prints x (the base constructor is not called)'''
        print(x)

In [11]:
class RunInitial:
    '''A simple base class for an initial non-verbose outlier scan'''
    # Dataset placeholder; nothing in this class assigns it yet
    data = None
    
    def __init__(self,x):
        '''Placeholder constructor: only prints x'''
        print(x)
        
    # The scan steps below are not implemented yet. They take `self` so they
    # can be called on an instance, like every other method in this hierarchy.
    def categorical_column_anomaly(self):
        '''Placeholder for the categorical-column scan (not implemented)'''
        pass
    
    def numerical_column_anomaly(self):
        '''Placeholder for the numerical-column scan (not implemented)'''
        pass
        
    def date_time_anomaly(self):
        '''Placeholder for the date/time scan (not implemented)'''
        pass
    
    def obtainAnomaly(self):
        '''Placeholder for obtaining the scan result (not implemented)'''
        pass

In [13]:
class Report(RunInitial):
    '''An extension of the RunInitial class that offers a more verbose and visual report for an initial anomaly scan'''
    # Dataset placeholder; nothing in this class assigns it yet
    data = None
    
    def __init__(self,x):
        '''Constructor: hands x to the RunInitial constructor'''
        # The constructor previously had no body at all, which is a syntax
        # error; delegating to the base class is the smallest valid behaviour.
        super().__init__(x)
        
    # The report steps below are not implemented yet.
    def metadata(self):
        '''Placeholder (not implemented)'''
        pass
    
    def design_report(self):
        '''Placeholder (not implemented)'''
        pass
        
    def report_to_excel(self):
        '''Placeholder (not implemented)'''
        pass
    
    def visualize_pca(self):
        '''Placeholder (not implemented)'''
        pass
    
    def visualize_kernel_density(self):
        '''Placeholder (not implemented)'''
        pass
    
    def visualize_decision_tree(self):
        '''Placeholder (not implemented)'''
        pass
        