In [14]:
# Imports for project purposes
# Full Project imports
import pandas as pd
import math as mt
import dateutil
from datetime import datetime, timedelta
import requests as rd
import numpy as np
from sklearn import neighbors, decomposition
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import smtplib
import scipy.stats as st
import os
from datetime import datetime, timedelta
from pandas.api.types import is_numeric_dtype
import copy

In [27]:
class Bootstrap:
    '''
    Flags anomalous category counts in the string columns of a time-stamped dataset.

    The most recent slice of the data (the test set) is compared against counts obtained by
    repeatedly resampling the older data (the training set) with replacement. Each category gets
    a p-value and a "surprise" score equal to -log2(p).
    '''
    # Full dataset the model works on (pandas.DataFrame), or None until one is attached/loaded
    data = None
    # Rows at or before the train/test cut-off; filled by split_train_test()
    trainDf = None
    # Rows after the train/test cut-off; filled by split_train_test()
    testDf = None
    # Name of the timestamp column
    timestamp = None
    # Dict of meta-parameters (see __init__)
    params = None

    # Column order of the frame returned by test()
    _RESULT_COLUMNS = ["column", "category", "count", "p", "estimated_count", "surprise"]

    def __init__(self, dataset: pd.DataFrame = None, timeCol = 'date_time', resamples = 1000, maxTrainingSizeMult = 10, maxCategory = 100, minCategories = 10):
        '''
        Single constructor covering both use cases: attach a dataset immediately, or leave
        `dataset` as None and attach one later through one of the load_* methods.
        (Python has no constructor overloading; a second `def __init__` replaces the first.)

        Parameters
        ----------
        dataset: pandas.DataFrame - Data to analyse, or None to load it later. Default = None
        timeCol: String - The name of the primary TimeStamp column. Default = "date_time".
        resamples: int - the number of times the bootstrap resamples. Making this very large will improve accuracy but significantly lower speed. Default = 1000
        maxTrainingSizeMult: int - If there is more than x = maxTrainingSizeMult ratio of training to test data, trim training data to most recent. Default = 10
        maxCategory: int - Maximum number of categories in a column (to ensure that counts are not tiny and are meaningful), column skipped if category count is higher than this. Default = 100
        minCategories: int - if a category has both an estimated and an actual count lower than this value, don't report it in bootstrap surprise. Default = 10.
        '''
        self.timestamp = timeCol
        self.data = dataset

        # Meta-parameter initialization
        self.params = {
            "bootstrapResamples": resamples,
            "maxTrainingSizeMultiple": maxTrainingSizeMult,  # if there is more than X times more training data, trim to most recent
            "maxCategories": maxCategory,
            "minCategoryCount": minCategories,
        }


    # Loading data into project
    def load_html(self, link: str, tableIndex: int = 0) -> pd.DataFrame:
        '''
        Loads an HTML table and sets it as the dataset for the model.

        Parameters
        ----------
        link: String - The link to the page containing the table that is being loaded
        tableIndex: int - pandas.read_html returns one DataFrame per table found; this selects which one to keep. Default = 0


        Returns
        -------
        data : pandas.DataFrame
            Returns the DataFrame that has just been loaded as the dataset for the bootstrap model
        '''
        tables = pd.read_html(link)
        self.data = tables[tableIndex]
        return self.data


    # Loading data into project
    def load_csv(self, link: str) -> pd.DataFrame:
        '''
        Loads a CSV table and sets it as the dataset for the model.

        Parameters
        ----------
        link: String - The link to the dataset that is being loaded


        Returns
        -------
        data : pandas.DataFrame
            Returns the entire DataFrame that has just been loaded as the dataset for the bootstrap model
        '''
        self.data = pd.read_csv(link)
        return self.data


    # Loading data into project
    def load_exel(self, link: str) -> pd.DataFrame:
        '''
        Loads an Excel table and sets it as the dataset for the model.
        (The method name keeps its original spelling so existing callers keep working.)

        Parameters
        ----------
        link: String - The link to the dataset that is being loaded


        Returns
        -------
        data : pandas.DataFrame
            Returns the entire DataFrame that has just been loaded as the dataset for the bootstrap model
        '''
        self.data = pd.read_excel(link)
        return self.data


    # Loading data into project
    def load_sql_table(self, link: str, con = None) -> pd.DataFrame:
        '''
        Loads a SQL table and sets it as the dataset for the model.

        Parameters
        ----------
        link: String - The name of the SQL table that is being loaded
        con: Database connection handed to pandas.read_sql_table (required by pandas)


        Returns
        -------
        data : pandas.DataFrame
            Returns the entire DataFrame that has just been loaded as the dataset for the bootstrap model

        Raises
        ------
        TypeError
            If no connection is supplied
        '''
        if con is None:
            raise TypeError("load_sql_table() requires a database connection: pass con=...")
        self.data = pd.read_sql_table(link, con)
        return self.data


    # Converts Timestamp column of DataFrame to a legitimate timestamp
    def convert_time_stamp_to_datetime(self, formatting = '%Y%m%d %H:%M:%S') -> pd.DataFrame:
        '''
        Converts the timestamp column (the one named at construction) from string to date/time,
        modifying the attached DataFrame in place and returning it.

        Parameters
        ----------
        formatting: String - If formatting different from default = %Y%m%d %H:%M:%S, enter the format of your TimeSeries column

        Returns
        -------
        data : pandas.DataFrame
            Returns the entire DataFrame with the modified Timestamp column
        '''
        self.data[self.timestamp] = pd.to_datetime(self.data[self.timestamp], format = formatting)
        return self.data


    # Splits data into train and test set based on date/time
    def split_train_test(self, batchHours = 24*7):
        '''
        Splits Data into a train and test set, held within the object

        Parameters
        ----------
        batchHours: int - Size of the test set in terms of hours. Default is one week (24 * 7).

        Returns
        -------
        (trainDf, testDf) : tuple of pandas.DataFrame
            The same frames that are stored on the object
        '''
        times = self.data[self.timestamp]
        batchTs = times.max() - timedelta(hours = batchHours)
        self.testDf = self.data[times > batchTs]
        # `<=` so that rows stamped exactly at the cut-off are not silently dropped
        self.trainDf = self.data[times <= batchTs]
        return self.trainDf, self.testDf


    # Helpers and Math
    def pValue(self, threshold: np.number, result) -> float:
        '''
        Two-sided empirical p-value of `threshold` against the bootstrap counts in `result`.

        Parameters
        ----------
        threshold: number - The observed count being tested
        result: dict - Must hold 'bootstrap_counts' (sequence of resampled counts)

        Returns
        -------
        p : float
            The smaller of the two tail fractions. When that fraction is 0 but the bootstrap
            counts vary, a normal approximation (mean/std of the counts) is used instead.

        Raises
        ------
        ValueError
            If there are no bootstrap counts to compare against
        '''
        counts = np.asarray(result['bootstrap_counts'])
        if counts.size == 0:
            raise ValueError("result['bootstrap_counts'] is empty")

        # Taking the smaller of the 2 p-values (either could present large anomaly)
        pLarger = np.mean(counts >= threshold)
        pSmaller = np.mean(counts <= threshold)
        p = float(min(pLarger, pSmaller))

        # only use gaussian p-value when there is variation, but bootstrap p = 0
        stdev = np.std(counts)
        if stdev == 0 or p != 0:
            return p

        dist = st.norm(np.mean(counts), stdev)
        # sf() keeps precision in the far upper tail, where 1 - cdf() underflows to 0
        return float(min(dist.cdf(threshold), dist.sf(threshold)))


    def trimTraining(self, trainDf, params, testDf = None):
        '''
        Keeps only the most recent training rows, at most maxTrainingSizeMultiple times the test size.

        Parameters
        ----------
        trainDf: pandas.DataFrame - Training rows
        params: dict - Must hold 'maxTrainingSizeMultiple'
        testDf: pandas.DataFrame - Test rows used for sizing; defaults to the test set stored on the object

        Returns
        -------
        trainDfTrimmed : pandas.DataFrame
            Training rows sorted newest first and truncated
        '''
        if testDf is None:
            testDf = self.testDf
        maxRows = params['maxTrainingSizeMultiple'] * len(testDf)

        # trim to most recent
        return trainDf.sort_values(self.timestamp, ascending = False).iloc[:maxRows]


    # Returns names of categorical columns
    @staticmethod
    def getCategoricalColumnNames(df):
        '''
        Returns the names of the columns whose first non-null value is a string.

        Parameters
        ----------
        df: pandas.DataFrame - Frame to inspect

        Returns
        -------
        columnNames : list
            Column names, in the frame's column order
        '''
        columnNames = []
        for columnName in df.keys():
            # look past leading nulls; an empty / all-null column is not categorical
            values = df[columnName].dropna()
            if len(values) > 0 and isinstance(values.iloc[0], str):
                columnNames.append(columnName)
        return columnNames


    def test(self, trainDf = None, testDf = None, params = None):
        '''
        Runs the bootstrap test on every string column of the test set.

        Parameters
        ----------
        trainDf: pandas.DataFrame - Training rows; defaults to the split stored on the object
        testDf: pandas.DataFrame - Test rows; defaults to the split stored on the object
        params: dict - Meta-parameters; defaults to the ones given at construction

        Returns
        -------
        resultsDf : pandas.DataFrame
            One row per reported (column, category) with columns
            column, category, count, p, estimated_count, surprise; sorted by p.
            Empty (but with those columns) when nothing qualifies.
            surprise is -log2(p), so p == 0 yields inf.

        Raises
        ------
        ValueError
            If no train/test split or parameters are available, or bootstrapResamples < 1
        '''
        trainDf = self.trainDf if trainDf is None else trainDf
        testDf = self.testDf if testDf is None else testDf
        params = self.params if params is None else params
        if trainDf is None or testDf is None:
            raise ValueError("No train/test data: call split_train_test() or pass trainDf and testDf")
        if params is None:
            raise ValueError("No parameters available: pass params")
        if params['bootstrapResamples'] < 1:
            raise ValueError("bootstrapResamples must be at least 1")

        bootstrapDf = self.trimTraining(trainDf, params, testDf)
        # nothing to resample from / compare against
        if len(bootstrapDf) == 0 or len(testDf) == 0:
            return pd.DataFrame(columns = self._RESULT_COLUMNS)

        # set up dict, add counts (only string columns are considered)
        results = {}
        for columnName in self.getCategoricalColumnNames(testDf):
            categories = pd.concat([bootstrapDf[columnName], testDf[columnName]]).unique()
            if len(categories) > params['maxCategories']:
                continue

            testCounts = testDf[columnName].value_counts(dropna = False)
            results[columnName] = {
                category: {'bootstrap_counts': [], 'count': testCounts.get(category, 0)}
                for category in categories
                if not pd.isna(category)  # null is not reported as a category
            }

        # resample, add bootstrap counts
        for _ in range(params['bootstrapResamples']):
            # Draw random sample from training
            sampleDf = bootstrapDf.sample(len(testDf), replace = True)
            for columnName, categoryResults in results.items():
                # count by category
                trainCounts = sampleDf[columnName].value_counts(dropna = False)
                for category, result in categoryResults.items():
                    result['bootstrap_counts'].append(trainCounts.get(category, 0))

        # convert to records, add p-values
        minCount = params['minCategoryCount']
        bootstrap_results = []
        for columnName, categoryResults in results.items():
            for category, result in categoryResults.items():
                estimatedCount = int(np.round(np.mean(result['bootstrap_counts'])))

                # don't report entries with very low predicted and actual counts
                if estimatedCount < minCount and result['count'] < minCount:
                    continue

                bootstrap_results.append({"column": columnName,
                                          "category": category if category else "NULL",  # backup name for empty strings
                                          "count": result['count'],
                                          "p": self.pValue(result['count'], result),
                                          "estimated_count": estimatedCount,
                                          })

        if not bootstrap_results:
            return pd.DataFrame(columns = self._RESULT_COLUMNS)

        # Sorting by P-values and obtaining Surprise of each
        resultsDf = pd.DataFrame.from_records(bootstrap_results).sort_values('p')
        with np.errstate(divide = 'ignore'):  # p == 0 legitimately maps to infinite surprise
            resultsDf['surprise'] = -np.log2(resultsDf['p'])

        return resultsDf

In [3]:
'''
Uses Facebook Prophet, which forecasts from time context (day, hour, holiday),
to make predictions and test them against the dataset, thereby finding anomalies in the context of time.
'''
class TimeSeries:
    '''Stub for a Prophet-backed model; construction currently only imports Prophet.'''

    def __init__(self,x):
        '''
        Imports the Prophet class; nothing else is set up yet.

        Parameters
        ----------
        x: any - currently unused

        Raises
        ------
        ImportError
            If neither the `prophet` nor the legacy `fbprophet` package is installed
        '''
        # The package was renamed from `fbprophet` to `prophet` in its 1.0 release;
        # try the current name first and fall back to the legacy one.
        try:
            from prophet import Prophet
        except ImportError:
            from fbprophet import Prophet
        

In [4]:
"""
"""
class KernelPCA:
    '''Placeholder class; constructing it only echoes the given argument.'''

    def __init__(self, x) -> None:
        '''
        Prints the constructor argument to standard output.

        Parameters
        ----------
        x: any - value handed straight to print()
        '''
        print(x)

In [5]:
"""
"""
class Kernel:
    '''Placeholder class; constructing it only echoes the given argument.'''

    def __init__(self, x) -> None:
        '''
        Prints the constructor argument to standard output.

        Parameters
        ----------
        x: any - value handed straight to print()
        '''
        print(x)

In [6]:
"""
"""
class PCA:
    '''Placeholder class; constructing it only echoes the given argument.'''
    # NOTE(review): this name shadows the `PCA` imported from sklearn.decomposition
    # at the top of the file - confirm that is intended.

    def __init__(self, x) -> None:
        '''
        Prints the constructor argument to standard output.

        Parameters
        ----------
        x: any - value handed straight to print()
        '''
        print(x)

In [7]:
"""
"""
class Categorical:
    '''Placeholder class; constructing it only echoes the given argument.'''

    def __init__(self, x) -> None:
        '''
        Prints the constructor argument to standard output.

        Parameters
        ----------
        x: any - value handed straight to print()
        '''
        print(x)

In [12]:
"""
"""
class MultiDimCategorical:
    '''Placeholder class; constructing it only echoes the given argument.'''

    def __init__(self, x) -> None:
        '''
        Prints the constructor argument to standard output.

        Parameters
        ----------
        x: any - value handed straight to print()
        '''
        print(x)

In [11]:
"""
"""
class RunInitial:
    '''Placeholder class; constructing it only echoes the given argument.'''

    def __init__(self, x) -> None:
        '''
        Prints the constructor argument to standard output.

        Parameters
        ----------
        x: any - value handed straight to print()
        '''
        print(x)

In [13]:
"""
"""
class Report:
    '''Placeholder class; constructing it only echoes the given argument.'''

    def __init__(self, x) -> None:
        '''
        Prints the constructor argument to standard output.

        Parameters
        ----------
        x: any - value handed straight to print()
        '''
        print(x)