In [1]:
# Imports for project purposes
# Full Project imports
import pandas as pd
import math as mt
import dateutil
from datetime import datetime, timedelta
import requests as rd
import numpy as np
from sklearn import neighbors, decomposition
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import smtplib
import scipy.stats as st
import os
from datetime import datetime, timedelta
from pandas.api.types import is_numeric_dtype
import copy
from sklearn.model_selection import train_test_split

Vision for each class:
**Anomaly**: Base Class. User can load dataset, set train and test, and obtain a p-value: all the common utility functions. No other use.
**Bootstrap**: Bootstrapping for category counts. Not to be included in report because too slow. User control = hyperparams. A tool.
**TimeSeries**: TimeSeries anomaly estimation with visualization. Optional in initial report. User control = country. 
**Kernel-PCA**: Class used to combine Kernel Density and PCA. Will be run on initial DataSet (means alias needed in class). Returns a formatted table with marked anomaly. Builds off instances of both objects. Should extend Kernel Density class? User control = ???
**Kernel**: Train-test set anomaly as well as choice of running on initial report. Works with Date/Time intervals as well though not sure how this will be incorporated into report. User Control = ???
**PCA**: Report-only anomaly? Does it even make sense to have train/test anomaly in this class or in Kernel Density? I guess we would be checking whether the Kernel and PCA models fitted on one set fit well onto the other, i.e. whether there are inherent differences in shape. How does this apply to specific outliers? We are not testing Kernel Density/PCA as a model, but rather just using it as a means of changing the distribution.
**Categorical**: Returns a nicely formatted DataFrame with categorical outliers. The output is also colored. User input = auto anomalous values. Multicategorical is an extension of Categorical.

In [2]:
class Anomaly:
    """
    Base Class for an anomaly detection method.

    Holds a dataset together with an optional train/test split and provides the
    utilities shared by the subclasses: loading data, splitting it, computing
    two-sided P-values, stripping a table down to its usable numerical columns
    and colouring a table by threshold.
    """
    # Class-level defaults; every instance overwrites these with its own values
    data = None
    trainDf = None
    testDf = None

    # Constructor to set values for data
    def __init__(self, input_data = None):
        """
        Constructor for setting dataset reference to a specific dataset

        Parameters
        ----------
        input_data: Pandas DataFrame reference - Your dataset in the form of a Pandas DataFrame (optional)

        Raises
        ------
        AssertionError
            If input_data is neither None nor a pandas DataFrame
        """
        # Ensuring data is properly formatted (isinstance also accepts DataFrame subclasses)
        assert input_data is None or isinstance(input_data, pd.DataFrame), "inputted data is not a pandas DataFrame"
        self.data = input_data


    # Loading data into project
    def load_html(self, link: str, table_index: int = 0) -> pd.DataFrame:
        """
        Loads an HTML table and sets it as the dataset for the model.

        Common issues: inputting an invalid file path (your file will not be read if this is the case),
        linking another file format (ensure that your link is indeed a link to a website with tables), or giving a
        link to a website which does not allow scraping of its information.

        Parameters
        ----------
        link: String - The link to the dataset that is being loaded
        table_index: int - Which of the tables found on the page to use. Default is 0 (the first table).

        Returns
        -------
        data : pandas.DataFrame
            Returns the entire DataFrame that has just been loaded as the dataset for the model

        Raises
        ------
        IndexError
            If table_index does not refer to one of the tables found on the page
        """
        # pd.read_html returns a *list* of DataFrames (one per table on the page),
        # so a single table has to be picked before it can serve as the dataset
        tables = pd.read_html(link)
        self.data = tables[table_index]
        return self.data


    # Loading data into project
    def load_csv(self, link: str) -> pd.DataFrame:
        """
        Loads a CSV table and sets it as the dataset for the model.

        Common issues: Incorrect file path (ensure your file path is valid), a failure to enter a valid CSV
        (ensure your file is in CSV format)

        Parameters
        ----------
        link: String - The link to the dataset that is being loaded

        Returns
        -------
        data : pandas.DataFrame
            Returns the entire DataFrame that has just been loaded as the dataset for the model
        """
        self.data = pd.read_csv(link)
        return self.data


    # Loading data into project
    def load_excel(self, link: str) -> pd.DataFrame:
        """
        Loads an Excel table and sets it as the dataset for the model.

        Common issues: Incorrect file path (ensure your file path is valid), a failure to enter a valid Excel
        (ensure your file is in Excel format), Random spaces within your data (A random space within an Excel file might be read as an NaN value)

        Parameters
        ----------
        link: String - The link to the dataset that is being loaded

        Returns
        -------
        data : pandas.DataFrame
            Returns the entire DataFrame that has just been loaded as the dataset for the model
        """
        self.data = pd.read_excel(link)
        return self.data


    # Loading data into project
    def load_sql_table(self, sql: str, con, index_col: str = None) -> pd.DataFrame:
        """
        Loads a SQL table and sets it as the dataset for the model.

        Parameters
        ----------
        sql: String - The name of the SQL table that is being loaded
        con: The database connection handed to pandas.read_sql_table
        index_col: String - Name of the column to use as the index. Default is None (no index column).

        Returns
        -------
        data : pandas.DataFrame
            Returns the entire DataFrame that has just been loaded as the dataset for the model
        """
        # index_col must be passed by keyword: the third positional parameter of
        # pd.read_sql_table is `schema`, not `index_col`
        self.data = pd.read_sql_table(sql, con, index_col = index_col)
        return self.data


    # Setter for the training set
    def set_train(self, trainingSet: pd.DataFrame):
        """
        A setter for the training set.

        Parameters
        ----------
        trainingSet: pandas DataFrame - A DataFrame object that will serve as your training set
        """
        self.trainDf = trainingSet


    # Setter for the test set
    def set_test(self, testSet: pd.DataFrame):
        """
        A setter for the test set.

        Parameters
        ----------
        testSet: pandas DataFrame - A DataFrame object that will serve as your test set
        """
        self.testDf = testSet


    # Randomly split train and test set
    def assign_train_test(self,random_state = 42, training_set_ratio = 0.8, shuffling = True):
        """
        A default random splitter into train and test set

        Parameters
        ----------
        random_state : int - determines random state fed to model for reproducability of random results, default is 42
        training_set_ratio: float between 0.0 and 1.0 - what % of your data you would like to encompass the training set (test set will be made in complimentary way) default is 0.8
        shuffling: boolean - whether or not you would like your data randomly shuffled out of chronology prior to split (True/False). Default is True.

        Returns
        -------
        trainDf, testDf : tuple of pandas DataFrames
            The training and test sets, which are also stored on the object

        Raises
        ------
        AssertionError
            If no dataset has been loaded yet
        """
        # Ensuring that data actually exists before splitting
        assert self.data is not None, "You cannot assign a train and test set out of a dataset that has not been initialized"

        # Splitting into train and test; random_state is forwarded so that the
        # shuffled split is reproducible, as the parameter promises
        self.trainDf, self.testDf = train_test_split(self.data,
                                                     train_size = training_set_ratio,
                                                     shuffle = shuffling,
                                                     random_state = random_state)
        return self.trainDf, self.testDf


    # Specialized column based P-value function: double ended
    def return_p_value(self,col: pd.Series) -> pd.Series:
        """
        Standardizes a column into Z-scores and plugs them into the normal cdf, returning a Pandas Series
        holding the smaller tail probability of every element.

        Common issues: If Series does not have a set unique index, values will often become NaN's during computation.
        Ensure your Series has an index prior to Series computations to avoid this issue.

        Parameters
        ----------
        col: Pandas Series - A column to compute P-values on

        Returns
        -------
        P_value: Pandas Series
                A Pandas Series containing the corresponding P-value (by index and in order) to every individual element
                in the input column. A column without any variation yields 0.5 for every element.

        Raises
        ------
        AssertionError
            If the column contains NaN values
        """
        # Ensuring only valid values are passed
        assert not col.isnull().values.any(), "return_p_value cannot calculate P values on an NaN element, please ensure your input does not contain elements of class np.nan"

        # Since we assume a normal distribution, starting by obtaining the z-score
        centered = col.to_numpy() - col.mean()
        std = np.std(col)

        if std == 0:
            # No variation: every value sits exactly on the mean (z = 0)
            # instead of producing 0/0 = NaN
            z_scores = np.zeros(len(centered))
        else:
            z_scores = centered / std

        # Two-sided view: keep whichever tail is smaller, for all elements at once
        lower_tail = st.norm.cdf(z_scores)
        return pd.Series(np.minimum(lower_tail, 1 - lower_tail), index = col.index)


    # Drops non-numerical and nan values from a table
    def anomaly_prep(self, tabl: pd.DataFrame = None)-> pd.DataFrame:
        """
        Removes non-numerical columns and columns containing NaN from a table for proccessing purposes

        Parameters
        ----------
        tabl: pandas DataFrame - The table to process. If omitted, the dataset stored on the object is used.

        Returns
        -------
        tabl: pandas DataFrame
            A DataFrame containing only the numerical columns that are free of NaN values
        """
        if tabl is None:
            tabl = self.data

        # Finding all numerical components of the table so that pca can function
        numeric = tabl.select_dtypes(include = [np.number])
        # axis must be given by keyword; the positional form was removed in pandas 2
        return numeric.dropna(axis = 1)


    # Assigning colors to problematic values (still grouped with indices so easy to tell)
    # Yellow: mild concern, Orange: serious concern, red - major concern
    def designer(self, frame: pd.DataFrame = None):
        """
        Takes a given DataFrame and Returns a styled version where every numerical value exceeding a certain threshold
        will be colored in a certain color to signify increasing levels of anomaly.

        The thresholds default to 5, 10 and 15 and can be overridden interactively through the console prompts.

        Parameters
        ----------
        frame: Pandas DataFrame - A table to be colored. If omitted, the dataset stored on the object is used.

        Returns
        -------
        styler: pandas Styler
            The colored version of the table

        Raises
        ------
        AssertionError
            If neither a frame nor a stored dataset is available
        """
        # Ensuring that we have a dataset to work with
        if not isinstance(frame, pd.DataFrame):
            frame = self.data

        assert isinstance(frame, pd.DataFrame), "You must initialize a Dataset in order to use the class designer"

        threshold1 = 5
        threshold2 = 10
        threshold3 = 15
        print("Would you like to reset default issue alert thresholds?[5,10,15]")
        if input().upper() == 'YES':
            print("Mild concern threshold (in probability (percentage) of issue being present):")
            threshold1 = float(input())
            print("Moderate concern threshold (in probability (percentage) of issue being present)")
            threshold2 = float(input())
            print("Serious concern threshold (in probability (percentage) of issue being present)")
            threshold3 = float(input())

        def pick_colour(x):
            # Most severe tier first, so each cell receives exactly one colour
            if x > threshold3:
                return 'background-color: red'
            if x > threshold2:
                return 'background-color: orange'
            if x > threshold1:
                return 'background-color: yellow'
            # 'lightgray' is the valid CSS spelling ('light-gray' is ignored by browsers)
            return 'background-color: lightgray'

        # Only numerical, NaN-free columns can be compared against the thresholds
        numeric_cols = list(self.anomaly_prep(frame).columns)
        styler = frame.style
        if not numeric_cols:
            return styler

        # Styler.applymap was renamed to Styler.map in pandas 2.1; support both
        paint = getattr(styler, "map", None) or styler.applymap
        return paint(pick_colour, subset = numeric_cols)
    


In [3]:
class Bootstrap(Anomaly):
    """A class for returning anomaly of categorical column counts, utilizing the metric of surprise (entropy)"""
    # Class-level defaults; every instance overwrites these with its own values
    data = None
    trainDf = None
    testDf = None
    timestamp = None
    params = None

    # For fitting data right away
    def __init__(self, dataset: pd.DataFrame = None, timeCol: str = "date_time",  resamples: int = 1000, maxTrainingSizeMult: int = 10, maxCategory: int = 100, minCategories: int = 10):
        """
        Overloaded constructor for attaching dataset immediately, can be done independently within any of the load functions

        Parameters
        ----------
        dataset: pandas DataFrame - The dataset to attach (optional)
        timeCol: String - The name of the primary TimeStamp column. Default = "date_time".
        resamples: int - the number of times the bootstrap resamples. Making this very large will improve accuracy but significantly lower speed. Default = 1000
        maxTrainingSizeMult: int - If there is more than x  = maxTrainingSizeMult ratio of training to test data, trim training data to most recent. Default = 10
        maxCategory: int - Maximum number of categories in a column (to ensure that counts are not tiny and are meaninful), column skipped if value count higher than this. Default = 100
        minCategories: int - if a category has both an estimated and an observed count lower than this value, don't report it in bootstrap surprise. Default = 10.
        """
        self.timestamp = timeCol
        self.data = dataset

        # Meta-parameter initialization
        self.params = {
          "bootstrapResamples": resamples,
          "maxTrainingSizeMultiple":maxTrainingSizeMult, # if there is more than X times more training data, trim to most recent
          "maxCategories":maxCategory,
          "minCategoryCount": minCategories,
        }


    # Converts Timestamp column of DataFrame to a legitimate timestamp
    def convert_timestamp_to_datetime(self, formatting: str = '%Y%m%d %H:%M:%S') -> pd.DataFrame:
        """
        Converts the timestamp column (self.timestamp) from string to date/time, making the modifications both to the
        fitted Data Frame and returning the new Data Frame

        Parameters
        ----------
        formatting: String - If formatting different from default = %Y%m%d %H:%M:%S, enter the format of your TimeSeries column

        Returns
        -------
        data : pandas.DataFrame
            Returns the entire DataFrame with the modified Timestamp column
        """
        self.data[self.timestamp] =  pd.to_datetime(self.data[self.timestamp], format = formatting)
        return self.data


    # Splits data into train and test set based on date/time
    def split_train_test_by_time(self, batchHours: int = 24*7):
        """
        Splits Data into a train and test set, held within the object. The test set is made of the rows from the
        last batchHours hours of the data, the training set of everything up to (and including) that boundary.

        Parameters
        ----------
        batchHours: int - Size of the test set in terms of hours. Default is one week (24 * 7).

        Raises
        ------
        AssertionError
            If the timestamp column still holds strings
        """
        data_type = type(self.data[self.timestamp].iloc[0])
        assert  data_type != str, "Timestamp Column should be converted to datetime via convert_timestamp_to_datetime prior to split"

        # Splitting by interval = batchHours
        stamps = self.data[self.timestamp]
        batchTs = stamps.max() - timedelta(hours = batchHours)
        self.testDf = self.data[stamps > batchTs]
        # <= so that rows lying exactly on the boundary are not lost from both sets
        self.trainDf = self.data[stamps <= batchTs]


    # Helpers and Math
    def p_value(self,data, threshold: np.number, result: dict) -> float:
        """
        Returns the p-value of an observed count given its bootstrap counts

        Parameters
        ----------
        data: list/array - The bootstrap counts the P-value is computed on
        threshold: np.number - The observed count that is checked for being anomalous
        result: dict - A record containing the keys "bootstrap_counts" and "count", used for the Gaussian fallback

        Returns
        -------
        pGauss : float
            The empirical two-sided bootstrap p-value, or a Gaussian approximation of it when the bootstrap
            p-value is 0 although the bootstrap counts do vary
        """
        counts = np.asarray(data)

        # Taking the smaller of the 2 p-values(either could present large anomaly)
        pLarger = np.count_nonzero(counts >= threshold) / len(counts)
        pSmaller = np.count_nonzero(counts <= threshold) / len(counts)
        p = min(pLarger, pSmaller)

        # only use gaussian p-value when there is variation, but bootstrap p = 0
        stdev = np.std(counts)
        if stdev == 0 or p != 0:
            return p

        # Normalizing
        pGauss = st.norm(np.mean(result['bootstrap_counts']), stdev).cdf(result['count'])
        return min(pGauss, 1 - pGauss)


    def trim_table(self,df: pd.DataFrame) -> pd.DataFrame:
        """
        Trims a DataFrame, ensuring that it does not exceed the training set max size hyper parameter
        (maxTrainingSizeMultiple times the size of the test set)

        Parameters
        ----------
        df: pandas DataFrame - The DataFrame that is being trimmed to fit to the training set hyperparameter

        Returns
        -------
        dfTrimmed : pandas DataFrame
            Returns the most recent rows of the DataFrame, fit to the training set specifications
        """
        # trim to most recent
        newest_first = df.sort_values(self.timestamp, ascending = False)
        row_limit = self.params['maxTrainingSizeMultiple'] * len(self.testDf)
        return newest_first[:row_limit]


    # Returns names of categorical columns
    def get_categorical_col_names(self, df: pd.DataFrame) -> list:
        """
        Returns the names of categorical columns in a Pandas DataFrame (columns whose first non-missing value is a string)

        Parameters
        ----------
        df: pandas DataFrame - The DataFrame whose columns are checked for being categorical data

        Returns
        -------
        columnNames : list
            The list of all categorical column names
        """
        columnNames = []
        for columnName in df.columns:
            # Judge the column by its first real value: a leading NaN must not hide a
            # string column, and an empty column is simply not categorical
            present = df[columnName].dropna()
            if not present.empty and isinstance(present.iloc[0], str):
                columnNames.append(columnName)
        return columnNames


    def train_test_anomaly(self) -> pd.DataFrame:
        """
        Tests for difference between training and test set counts, returning a report that quantifies difference between
        training and test set as surprise.

        Returns
        -------
        resultsDf : pandas DataFrame
            A DataFrame containing a report for the difference between expected and detected counts within the test set
            With the inclusion of a column quantifying irregularity as surprise (entropy). Sorted by p-value; empty
            (but with all columns present) when no category qualifies for the report.

        Raises
        ------
        AssertionError
            If the train and test sets have not been set up as DataFrames
        """
        # Preventative measures
        assert isinstance(self.trainDf, pd.DataFrame) and isinstance(self.testDf, pd.DataFrame), "Please set up your train and test sets as Pandas DataFrames prior to attempting this step"

        # get all of the string columns
        columnNames = self.get_categorical_col_names(self.testDf)

        bootstrapDf = self.trim_table(self.trainDf)

        # set up dict, add counts
        results = {}
        for columnName in columnNames:
            # pd.concat replaces Series.append, which was removed in pandas 2
            uniqueValues = pd.concat([bootstrapDf[columnName], self.testDf[columnName]]).unique()
            if len(uniqueValues) > self.params['maxCategories']:
                continue

            testCounts = self.testDf[columnName].value_counts(dropna = False)

            # Missing values are not a category of their own
            categories = [value for value in uniqueValues if not pd.isna(value)]
            results[columnName] = {category: {'bootstrap_counts': [],
                                              'count': testCounts.get(category, 0)}
                                   for category in categories}

        # resample, add bootstrap counts
        for _ in range(self.params['bootstrapResamples']):

            # Draw random sample from training
            sampleDf = bootstrapDf.sample(len(self.testDf), replace = True)
            for columnName, categoryResults in results.items():

                # count by category
                trainCounts = sampleDf[columnName].value_counts(dropna = False)

                # put results in dict
                for category, result in categoryResults.items():
                    result['bootstrap_counts'].append(trainCounts.get(category, 0))

        # convert to records, add p-values
        bootstrap_results = []
        for columnName, categoryResults in results.items():
            for category, result in categoryResults.items():
                estimatedCount = int(np.round(np.mean(result['bootstrap_counts'])))

                # don't report entries with very low predicted and actual counts
                if estimatedCount < self.params['minCategoryCount'] and result['count'] < self.params['minCategoryCount']:
                    continue

                p = self.p_value(result['bootstrap_counts'], result['count'], result)

                # Backup: an empty category name would be unreadable in the report
                categoryName = category if category else "NULL"

                bootstrap_results.append({"column":columnName,
                                   "category":categoryName,
                                   "count":result['count'],
                                   "p": p,
                                   "estimated_count":estimatedCount,
                                   })

        reportColumns = ["column", "category", "count", "p", "estimated_count"]
        if not bootstrap_results:
            # Nothing to report: still hand back a table of the documented shape
            return pd.DataFrame(columns = reportColumns + ["surprise"])

        # Sorting by P-values and obtaining Surprise of each
        resultsDf = pd.DataFrame.from_records(bootstrap_results, columns = reportColumns).sort_values('p')
        # A p-value of exactly 0 is maximally surprising: let it become +inf quietly
        with np.errstate(divide = 'ignore'):
            resultsDf['surprise'] = -np.log2(resultsDf['p'])

        return resultsDf
        

In [4]:
from fbprophet import Prophet

class TimeSeries(Anomaly):
    """
    Utilizes facebook prophet and its ability to predict the future based off specific time context (day, hour, holiday)
    to make predictions and test those against the dataset, thus finding anomaly with the context of time. Please ensure 
    you set your train and test sets prior to computation.
    """
    # Class-level defaults; every instance overwrites these with its own values
    data = None
    trainDf = None
    testDf = None
    prophetDf = None
    countryModel = None
    forecast = None
    timestamp = None

    def __init__(self,timeStampInput: str = 'date_time', inp_data = None, train = None,test = None):
        """
        Does not require any inputs, but gives user option to initialize input data/train/test right from the get-go

        Common Issues: Please ensure you set the name of your timestamp column. Default value  = "date_time". Not setting the name
        of this column will result in an error.

        Parameters
        ----------
        timeStampInput: String - The name of the timestamp column. Default = "date_time".
        inp_data: pandas DataFrame - A given data set
        train: pandas DataFrame - A given training set
        test: pandas DataFrame - A given test set

        Raises
        ------
        AssertionError
            If any of the given tables is neither None nor a pandas DataFrame
        """
        # Ensuring that if user has given us input, it is of the correct form
        assert inp_data is None or isinstance(inp_data, pd.DataFrame), "inputted data is not a pandas DataFrame"
        self.data  = inp_data

        # Ensuring training set is of the correct form
        assert train is None or isinstance(train, pd.DataFrame), "inputted data is not a pandas DataFrame"
        self.trainDf  = train

        # Ensuring test set is of the correct form
        assert test is None or isinstance(test, pd.DataFrame), "inputted data is not a pandas DataFrame"
        self.testDf = test

        self.timestamp = timeStampInput


    # Converts Timestamp column of DataFrame to a legitimate timestamp
    def convert_timestamp_to_datetime(self, formatting: str = '%Y%m%d %H:%M:%S') -> pd.DataFrame:
        """
        Converts the timestamp column (self.timestamp) from string to date/time, making the modifications both to the
        fitted Data Frame and returning the new Data Frame

        Parameters
        ----------
        formatting: String - If formatting different from default = %Y%m%d %H:%M:%S, enter the format of your TimeSeries column

        Returns
        -------
        data : pandas.DataFrame
            Returns the entire DataFrame with the modified Timestamp column
        """
        self.data[self.timestamp] =  pd.to_datetime(self.data[self.timestamp], format = formatting)
        return self.data


    def truncate_timestamp_column(self, ts) -> pd.Series:
        """
        Truncates a timestamp column to the hour percision (minute, second, and microsecond all set to 0)

        Parameters
        ----------
        ts: pandas Series - A given Timestamp column to be truncated

        Returns
        -------
        ts : pandas Series
            The Timestamp column truncated to the hour percision
        """
        return ts.dt.floor("H")


    def group_and_build_time_table(self,truncated: bool = False) -> pd.DataFrame:
        """
        Builds a table on the basis of the value counts (or rather the log 10 of the value counts) of the
        training set timestamps

        Parameters
        ----------
        truncated: boolean - A value representing whether or not the training set has already been truncated to the hour. Default is False. If this is the case, the data will be autotruncated. 

        Returns
        -------
        prophetDf : pandas Data Frame
                   A table with the columns 'ds' (timestamp) and 'y' (log 10 of the number of rows at that timestamp)
        """
        # Only the timestamp column is needed; the training set itself is left untouched
        stamps = self.trainDf[self.timestamp]

        # Truncating timestamp if needed
        if not truncated:
            stamps = self.truncate_timestamp_column(stamps)
        groupedCounts = stamps.value_counts()

        # Grouping counts in a single DataFrame
        self.prophetDf = pd.DataFrame({'ds':groupedCounts.index,'y':np.log10(groupedCounts.values)})
        return self.prophetDf


    # Fits a Prophet model (with country holidays) onto the table built by group_and_build_time_table
    def train_model_on_country(self, country: str = "US"):
        """
        Trains a Facebook Prophet model on a specified country with an interval width of one sigma. Default country
        is the United States. Will fit this country model onto the TimeSeries table (self.prophetDf).

        Common issues: Downloading Prophet can be very messy and certain modifications might need to be made to 
        enable holidays such as Easter. You can read more about this issue here: 

        https://github.com/facebook/prophet/issues/1293

        Parameters
        ----------
        Country: String - The name of a valid country included in the Prophet seasonality package

        Returns
        -------
        self.countryModel: Prophet
                          Facebook Prophet model fitted onto the country of your choice (is also now contained as
                          an instance variable)

        Raises
        ------
        AssertionError
            If country is not a string
        """
        # Ensuring inputted country is a string
        assert isinstance(country, str), "Given country should be formatted as a string"

        # Train model; seasonality and growth options are left at Prophet's defaults
        self.countryModel = Prophet(interval_width=0.68) # one sigma
        self.countryModel.add_country_holidays(country_name=country)

        self.countryModel.fit(self.prophetDf)
        return self.countryModel


    # Splits data into train and test set based on date/time
    def split_train_test_by_time(self,batchHours: int = 24*7):
        """
        Splits Data into a train and test set, held within the object. The test set is made of the rows from the
        last batchHours hours of the data, the training set of everything up to (and including) that boundary.

        Parameters
        ----------
        batchHours: int - Size of the test set in terms of hours. Default is one week (24 * 7).
        """
        stamps = self.data[self.timestamp]
        batchTs = stamps.max() - timedelta(hours = batchHours)
        self.testDf = self.data[stamps > batchTs]
        # <= so that rows lying exactly on the boundary are not lost from both sets
        self.trainDf = self.data[stamps <= batchTs]


    # Applies Prophet analytics to create a forecast based on hours
    def predict_future(self, timestamp: str = None):
        """
        Builds (and returns) a future forecast for comparison to test set (which should be further ahead in time relative 
        to the trainig set). Made on the basis of the number of hours which encompass the test set.

        Parameters
        ----------
        timestamp: String - The name of the timestamp column of the test set. Defaults to the timestamp column
                   the object was set up with.

        Returns
        -------
        self.forecast: Table
                      A forecast representiative of the predictions the Prophet Model would assume the test set to be,
                      based on the training set.
        """
        column = self.timestamp if timestamp is None else timestamp
        stamps = self.testDf[column]

        # If the column holds strings, convert to date/time *before* any date arithmetic.
        # Work on a copy so a test set that is a slice of the dataset is not written through.
        if stamps.map(type).eq(str).any():
            self.testDf = self.testDf.copy()
            self.testDf[column] = pd.to_datetime(stamps)
            stamps = self.testDf[column]

        # find number of hours to predict: ceil of hours in testDf
        timeDelta = stamps.max() - stamps.min()
        hours = int(timeDelta.days*24 + timeDelta.seconds/(60*60))+1
        future = self.countryModel.make_future_dataframe(periods = hours, freq = 'H')
        self.forecast = self.countryModel.predict(future)
        return self.forecast

    def train_test_anomaly(self, country = "US"):
        """
        Fits Prophet Model on Train set and runs predictions on test set, finding the difference between expected counts and 
        TimeSeries predicted counts while also accounting for the anomaly in the difference between the 2 estimations.

        Common Issues: Ensure your timestamp has been set as the correct value prior to run.

        Parameters
        ----------
        Country: String - the name of the country whose holidays and timezone you would like to train the Prophet Time Series model on

        Returns
        -------
        prophetResultsDf: Pandas DataFrame
                      A table containing difference between observed and expected counts, sorted in order of the metric of surprise
                      (How anomalous/chaotic/entropic the data is)
        """
        # If train and test sets have not been set prior to run, split them by time
        if not (isinstance(self.trainDf, pd.DataFrame) and isinstance(self.testDf, pd.DataFrame)):
            self.split_train_test_by_time()

        # Training the model on country
        print("Train and Test Sets Prepared\n")
        self.group_and_build_time_table(False)
        self.train_model_on_country(country)
        print("Model trained...\n")

        #  Predicting values
        self.predict_future(self.timestamp)
        print("Future predicted...\n")

        # Returning anomaly estimation
        return self.surprise_estimator(country)


    def surprise_estimator(self, country: str = "US") -> pd.DataFrame:
        """
        Based on the training-set-reliant prediction of the future, calculates the anomaly between the forecast and the
        test set utilizing the metric of surprise. Requires a forecast (predict_future) to be present.

        Parameters
        ----------
        Country: String - the name of the country the model was trained on (currently not used by the computation)

        Returns
        -------
        prophetResultsDf: Pandas DataFrame
                      A table containing difference between observed and expected counts, sorted in order of the metric of surprise
                      (How anomalous/chaotic/entropic the data is). Hours of the test set that the forecast does not cover
                      are left out.
        """
        # Processing the test set: observed number of rows per hour
        truncated = self.truncate_timestamp_column(self.testDf[self.timestamp])
        groupedCounts = truncated.value_counts()

        # find p-value
        prophet_results = []

        # Comparing observed test counts with the forecast for identical intervals
        for ts, observedCount in zip(groupedCounts.index, groupedCounts.values):
            fcstExample = self.forecast[self.forecast['ds'] == ts]
            if fcstExample.empty:
                # The forecast does not reach this hour; nothing to compare against
                continue
            mean = fcstExample['yhat'].iloc[0]
            # The forecast interval is one sigma wide on each side (interval_width = 0.68)
            stdev = (fcstExample['yhat_upper'].iloc[0] - fcstExample['yhat_lower'].iloc[0])/2

            # Calculating the P-value (the model works on log 10 of the counts)
            p = st.norm(mean, stdev).cdf(np.log10(observedCount))
            p = min(p,1-p)

            prophet_results.append({"column":"Forecast",
                               "category":str(ts),
                               "count":observedCount,
                               "p": p,
                               "estimated_count":int(np.round(np.power(10,mean))),
                               })

        reportColumns = ["column", "category", "count", "p", "estimated_count"]
        if not prophet_results:
            # Nothing comparable: still hand back a table of the documented shape
            return pd.DataFrame(columns = reportColumns + ["surprise"])

        # Obtaining Entropy of Time-Series values
        prophetResultsDf = pd.DataFrame.from_records(prophet_results, columns = reportColumns).sort_values('p')
        # A p-value of exactly 0 is maximally surprising: let it become +inf quietly
        with np.errstate(divide = 'ignore'):
            prophetResultsDf['surprise'] = -np.log2(prophetResultsDf['p'])
        return prophetResultsDf


    # Takes in a model that has been trained on country, plots graphs for visualization
    def visualize(self):
        """
        Builds plots for the forecast, displaying its construction on the basis of certain time intervals utilizing the 
        country fitted model self.countryModel

        Common issues: this step cannot be completed until you have trained the model on country (train_model_on_country)
        and made a forecast (predict_future). Please complete these prior steps to build the forecast predictions prior to attempting
        to visualize them.
        """
        # Model visualization: the forecast itself, then its individual components
        self.countryModel.plot(self.forecast)
        self.countryModel.plot_components(self.forecast)
        

ERROR:fbprophet:Importing plotly failed. Interactive plots will not work.


In [110]:
class KernelPCA(Anomaly):
    """Combines Kernel Density and PCA into a joint process that runs on all numerical columns to triangulate outliers"""
    data = None
    knal = None
    principalComp = None
    trainDf = None
    testDf = None
    replace_nans_with = None
    
    def __init__(self,dataInit: pd.DataFrame = None, trainInit: pd.DataFrame = None, testInit: pd.DataFrame = None, replace_nans_with = False):
        """
        Gives user the option to initialize Data Set, Training Set, and Test Set
        
        Parameters
        ----------
        dataInit: pandas DataFrame - A given data set
        trainInit: pandas DataFrame - A given training set
        testInit: pandas DataFrame - A given test set
        replace_nans_with: value forwarded to Kernel and PCA_Analysis as replace_nan_with when build() runs (default = False)
        """
        # Only stores the inputs; the Kernel / PCA_Analysis helpers are created by build()
        self.data = dataInit
        self.trainDf = trainInit
        self.testDf = testInit
        self.replace_nans_with = replace_nans_with
        
        
    # Builds Kernel and PCA models based on input parameters
    def build(self) -> str:
        """
        Builds Kernel and PCA objects for run on dataset
        
        Common Issues: You must either load your dataset or set up both train and test set prior to running this method. You can do this 
        using the methods in the Anomaly() class (which this class extends)
        
        Returns
        -------
        "Build Successful" : str
                         Indication that the objects have been built successfully

        Raises
        ------
        AssertionError
            If neither a data set nor both a train and a test set are DataFrames
        """
        has_data = isinstance(self.data, pd.DataFrame)
        # The check must read the instance attributes; bare `train` / `test` names do not exist in this scope
        has_train_test = isinstance(self.trainDf, pd.DataFrame) and isinstance(self.testDf, pd.DataFrame)
        assert has_data or has_train_test, "Please set up your dataset or your train and test sets prior to running build()"

        self.knal = Kernel(self.data,self.trainDf, self.testDf, replace_nan_with = self.replace_nans_with)
        self.principalComp = PCA_Analysis(self.data, self.trainDf, self.testDf, replace_nan_with = self.replace_nans_with)
        return "Build Successful"
        
    
    # Runs a Data Quality Test on Current DataSet: for Report
    def run_on_current_data(self) -> pd.DataFrame:
        """
        Runs the Kernel Density- Principal Component Analysis anomaly test Combo on current dataset, returning the surprise metric for every element in a DataFrame
        
        Common Issues: can only be run after build() has been run and the dataset has been initialized.
        
        Returns
        -------
        final_anomaly: Pandas DataFrame
                      The Kernel surprise table in which every numerical column has been replaced by
                      sqrt(kernel surprise * PCA row surprise), sorted by "mean_surprise" (descending)
        """
        
        # Finding respective Kernel and PCA anomalies
        knal_anomaly = self.knal.surprise_of_table()
        pca_anomaly = self.principalComp.surprise_on_table()
        
        # Obtaining PCA Surprise factor
        pcaSurpriseCol = pca_anomaly.get("Surprise")
        # pca_prep is used here only to pick the numerical columns of the Kernel result
        temp = self.principalComp.pca_prep(knal_anomaly)
        
        # Geometric mean of the column-wise (Kernel) and row-wise (PCA) surprise; multiply aligns on the index
        for column in temp.columns:
            knal_anomaly[column] = (knal_anomaly[column].multiply(pcaSurpriseCol)).apply(np.sqrt)
            
        # Sorting by Surprise of values
        final_anomaly = knal_anomaly.sort_values(by = "mean_surprise", ascending = False)
        return final_anomaly

    
    
    # Calculates Anomaly between Train and Test
    def train_test_anomaly(self) -> pd.DataFrame:
        """
        Runs the Kernel Density- Principal Component Analysis anomaly test Combo on train and test sets, returning the surprise metric for every test-set element in a DataFrame
        
        Common Issues: Can only be run after build() has been run and train + test sets have been initialized.
        
        Returns
        -------
        final_anomaly: Pandas DataFrame
                      A DataFrame with an anomaly approximation for each value in the test set plus a
                      "mean_surprise" column, sorted by "mean_surprise" (descending)
        """
        knal_anomaly = self.knal.train_test_anomaly()
        pca_anomaly = self.principalComp.train_test_anomaly()
        
        # Obtaining PCA Surprise factor
        pcaSurpriseCol = pca_anomaly.get("Surprise")
        temp = self.principalComp.pca_prep(knal_anomaly)
        mean_surp = np.zeros(temp.shape[0])
        
        # Geometric mean of Kernel and PCA surprise per column, accumulated row by row
        for column in temp.columns:
            knal_anomaly[column] = (knal_anomaly[column].multiply(pcaSurpriseCol)).apply(np.sqrt)
            mean_surp += knal_anomaly[column].values
            
        # Adding in mean_surprise (the accumulated sum divided by the number of columns in the table)
        knal_anomaly = knal_anomaly.assign(mean_surprise = mean_surp / knal_anomaly.shape[1])
        
            
        # Sorting by Surprise of values
        final_anomaly = knal_anomaly.sort_values(by = "mean_surprise", ascending = False)
        return final_anomaly


In [38]:
class Kernel(Anomaly):
    """Column-based numerical outlier tester that utilizes fitting a Kernel and obtaining a density estimation"""
    data  = None
    trainDf = None
    testDf = None
    timestamp_anom = None
    timestamp = None
    replace_nan = None

    # Message raised by every kernel fitter when the input still contains missing values
    _NAN_MESSAGE = "Please make sure to preprocess your table prior so that Nan's and non-numerical values are removed, you can do so with the Kernel.kernel_prep() method"
      
    def __init__(self, data_value:pd.DataFrame = None, train_set: pd.DataFrame = None, test_set: pd.DataFrame = None, timestamp_anom: bool = False,timestamp_init = 'date_time', replace_nan_with = False):
        """
        Object constructor which allows user to set dataset as default and initialize train+test sets. 
        Keep in mind the dataset can also be loaded by means of any method in the base Anomaly class.
        
        Parameters
        ----------
        data_value: Pandas DataFrame - The dataset you would like to fit Kernel Density to
        train_set: Pandas DataFrame  - The train set you would like to use for Kernel Density purposes (default = None)
        test_set: Pandas DataFrame  - The test set you would like to use for Kernel Density purposes (default = None)
        timestamp_anom: boolean - Whether or not to also run Kernel Density on a date/time column (default = False)
        timestamp_init: String - The name of the timestamp column, only stored when timestamp_anom is True (default = 'date_time')
        replace_nan_with: a number used to fill NaN values; a boolean means columns containing NaN are dropped instead (default = False)
        """
        self.data = data_value
        self.trainDf = train_set
        self.testDf = test_set
        self.timestamp_anom = timestamp_anom
        
        # The timestamp column name is only remembered when the user asked for timestamp anomaly
        if(self.timestamp_anom == True):
            self.timestamp = timestamp_init
        
        # Warning user if the training set has fewer rows than the test set
        if isinstance(self.trainDf, pd.DataFrame) and isinstance(self.testDf, pd.DataFrame) and self.trainDf.shape[0] < self.testDf.shape[0]:
            print('Warning: training set smaller than test set. Could potentially damage results')
        
        self.replace_nan = replace_nan_with
        
        
    # Shared implementation behind the four public kernel fitters
    def _fit_kernel(self, stat: pd.Series, kernel: str) -> tuple:
        """
        Fits a sklearn KernelDensity with the given kernel name to one column and scores that same column.
        
        Parameters
        ----------
        stat: Pandas Series - Numerical values without NaN
        kernel: String - Kernel name passed to neighbors.KernelDensity
        
        Returns
        -------
        (model, density): tuple
                  The fitted KernelDensity and the result of model.score_samples on the fitted values

        Raises
        ------
        AssertionError
            If stat contains missing values
        """
        assert not stat.isnull().values.any(), self._NAN_MESSAGE
        # KernelDensity expects a 2-D (n_samples, n_features) input
        samples = stat.to_numpy().reshape(-1,1)
        model = neighbors.KernelDensity(kernel = kernel).fit(samples)
        return model, model.score_samples(samples)


    # Using cosine kernel function to get estimate for log density
    def cosine_kernel(self, stat: pd.Series) -> np.ndarray:
        """
        Fits a Cosine Kernel to the data and scores samples by their density.
        Used for distributions with low variability.
        
        Common issues: argument stat should be a Pandas Series: not a DataFrame or an array. Should contain numerical values only,
        NaN's and non numerical values will break the cosine_kernel.
        
        Parameters
        ----------
        stat: Pandas Series - The Series you would like to fit the cosine Kernel to
        
        Returns
        -------
        cos_density: np.ndarray
                  The scored log densities, one per sample (unlike the other kernel methods the model is not returned).
                  Can read more here:
                  https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KernelDensity.html
        """
        _, cos_density = self._fit_kernel(stat, 'cosine')
        return cos_density
    
    
    # Using gaussian kernel function to get estimate for log density
    def gaussian_kernel(self,stat: pd.Series) -> tuple:
        """
        Fits a Gaussian Kernel to the data and scores samples by their density.
        Used for distributions with standard Gaussian variability.
        
        Common issues: argument stat should be a Pandas Series: not a DataFrame or an array. Should contain numerical values only,
        NaN's and non numerical values will break the gaussian_kernel.
        
        Parameters
        ----------
        stat: Pandas Series - The Series you would like to fit the gaussian Kernel to
        
        Returns
        -------
        (model, density): tuple
                  The fitted KernelDensity model and the scored log densities, one per sample. Can read more here:
                  https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KernelDensity.html
        """
        return self._fit_kernel(stat, 'gaussian')
    
    
    # Using exponential kernel function to get estimate for log density
    def exponential_kernel(self,stat: pd.Series)-> tuple:
        """
        Fits an Exponential (fatty tailed) Kernel to the data and scores samples by their density.
        Used for distributions with high variability.
        
        Common issues: argument stat should be a Pandas Series: not a DataFrame or an array. Should contain numerical values only,
        NaN's and non numerical values will break the exponential_kernel.
        
        Parameters
        ----------
        stat: Pandas Series - The Series you would like to fit the exponential Kernel to
        
        Returns
        -------
        (model, expDensity): tuple
                  The fitted KernelDensity model and the scored log densities, one per sample. Can read more here:
                  https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KernelDensity.html
        """
        return self._fit_kernel(stat, 'exponential')
    
    
    # Using epanechnikov kernel function to get estimate for log density
    def parabolic_kernel(self,stat: pd.Series) -> tuple:
        """
        Fits a Parabolic/Epanechnikov Kernel to the data and scores samples by their density. 
        Used for distributions with low variability
        
        Common issues: argument stat should be a Pandas Series: not a DataFrame or an array. Should contain numerical values only,
        NaN's and non numerical values will break the parabolic_kernel.
        
        Parameters
        ----------
        stat: Pandas Series - The Series you would like to fit the parabolic Kernel to
        
        Returns
        -------
        (model, epDensity): tuple
                  The fitted KernelDensity model and the scored log densities, one per sample. Can read more here:
                  https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KernelDensity.html
        """
        return self._fit_kernel(stat, 'epanechnikov')
    

    # Drops non-numerical and nan values from a table
    def kernel_prep(self, data = None)-> pd.DataFrame: 
        """
        Removes non-numerical columns and NaN from the dataset for proccessing purposes
        
        Common Issues: Ensure that the either the data value is passed or your class dataset is initialized.
        Failure of either will result in an error.

        Parameters
        ----------
        data: Pandas DataFrame - Table to prepare; anything that is not a DataFrame falls back to self.data
        
        Returns
        -------
        tabl: pandas DataFrame
            A new DataFrame holding only the numerical columns. If self.replace_nan is a number, NaN values are
            filled with it (and self.replace_nan is stored back as a float); if it is a boolean or None, every
            column that contains a NaN is dropped.
        """
        # If dataset not given, assume it is our preinitialized dataset
        if not isinstance(data, pd.DataFrame):
            data = self.data
            
        # Finding all numerical components of the table so that the kernels can function
        tabl = data.select_dtypes(include = [np.number])

        # A boolean (or unset) replacement value means "drop columns with missing values"
        if self.replace_nan is None or isinstance(self.replace_nan, bool):
            # axis must be passed by keyword: it is keyword-only in pandas 2.x
            return tabl.dropna(axis = 1)

        self.replace_nan = float(self.replace_nan)
        # fillna returns a new frame, avoiding an in-place write on a derived table
        return tabl.fillna(self.replace_nan)
    
    
    # Fits proper Kernel and returns the surprise by element
    def surprise_estimator(self,stat: pd.Series) -> pd.Series:
        """
        Returns the surprise on a per-value basis of the Kernel density estimation for a particular column.
        
        Common issues: Series should have a set index, one that generalizes to the rest of the dataset. Not having
        such an index will cause trouble when grouping all columns into a single DataFrame.
        
        Parameters
        ----------
        stat: Pandas Series - A Series of numerical values with a set index.
        
        Returns
        -------
        Surprise: Pandas Series
            -log2 of the p-values that self.return_p_value yields for the densities, sorted ascending.
        
        """
        # Surprise Metric
        model, metrics = self.kernel_decider(stat)
        # Carry the caller's index along so the result can be aligned back onto the source table
        pVals = self.return_p_value(pd.Series(metrics, index = stat.index))
        surprise = -np.log2(pVals)
        return surprise.sort_values()
    
    
    def kernel_decider(self,stat: pd.Series)-> tuple:
        """
        Fits the Kernel chosen from the column's variability and returns it together with the density scores.

        The choice is driven by how many standard deviations the most extreme value (on either side) lies
        from the mean: more than 3.2 -> exponential, from 2 to 3.2 -> gaussian, otherwise parabolic.
        
        Parameters
        ----------
        stat: Pandas Series - A Series of numerical values with a set index.
        
        Returns
        -------
        (model, metric): tuple
            The fitted KernelDensity model and the log-density scores of stat.
        """
        # Calculating maximum number of deviations from the mean so as to choose proper Kernel model
        mean = stat.mean()
        dev = stat.std()
        numDevMax = (stat.max() - mean)/dev
        # Measured as a positive distance so that low-side outliers count as well
        numDevMin = (mean - stat.min())/dev
        numDev = max(numDevMax, numDevMin)
        
        # Assigning appropriate Kernel Estimator on the basis of model's variability
        # (a NaN numDev, e.g. from a zero deviation, fails both comparisons and lands on the parabolic kernel)
        if numDev > 3.2:
            model, metric = self.exponential_kernel(stat)
        elif numDev >= 2:
            model, metric = self.gaussian_kernel(stat)
        else:
            model, metric = self.parabolic_kernel(stat)
            
        return model, metric

    
    # Calculation of date time entropies
    def date_time_interval_anomaly(self,column: pd.Series)-> pd.Series:
        """
        Computes the surprise of the intervals between consecutive timestamps and prints the interval table
        sorted by surprise (descending).
        
        Common issues: Given column should be a timestamp in either string or datetime format. Integer or floating point timestamps
        will cause issues. Timestamps before 1970 (Unix Threshold) will also cause issues.
        
        Parameters
        ----------
        column: Pandas Series - A Series representing date/time values in a DataFrame.
        
        Returns
        -------
        surprise: Pandas Series
            The "surprise" column of the interval table; it has one entry fewer than column because it
            describes the gaps between consecutive rows.
        """
        # Conversion to proper format
        if isinstance(column.iloc[0], str):
            column = self.convert_to_date_time(column)

        # Unix timestamps for ease of calculation
        unixCol = column.apply(self.convert_to_unix).to_numpy()

        # Finding time intervals
        difference_array = np.append(np.array([]), np.diff(unixCol))
        timeFrame = (pd.DataFrame().assign(index = np.arange(1,len(unixCol)), Times_diff = difference_array))
        dateSurprise = timeFrame.assign(surprise = self.surprise_estimator(timeFrame.get("Times_diff")))
        print(dateSurprise.sort_values(by = ['surprise'], ascending = False))
        return dateSurprise.get("surprise")

    
    # If date-value is given as a string, convert to date- time format first
    # TODO: make adaptable to all forms of date/time?
    def convert_to_date_time(self,column: pd.Series) -> pd.Series:
        """
        Converts a column to date time with format('%Y%m%d %H:%M:%S')
        
        Parameters
        ----------
        column: Pandas Series - A 'datetime-valued' Series to be converted to type = datetime
        
        Returns
        -------
        column: Pandas Series
            A datetime Pandas Series of type datetime 
        """
        return pd.to_datetime(column, format='%Y%m%d %H:%M:%S')

        
    # Converting the date to unix format for ease of calculations    
    def convert_to_unix(self,value: datetime) -> float:
        """
        Returns a unix timestamp version of a given timestamp.
        
        Common issues: Passed value is not of type datetime / Passed value is before 1970 will both yield errors.
        
        Parameters
        ----------
        value: datetime - An ordinary datetime timestamp
        
        Returns
        -------
        seconds: float
            The number of total seconds since January 1st, 1970
        """
        return (value - datetime(1970, 1, 1)).total_seconds()

    
    # A grouping of the entire kernel estimation process
    def surprise_of_table(self, index: str = None) -> pd.DataFrame:
        """
        Returns Surprise values for a whole table based off Kernel Density estimations (per column).
        
        Common issues: Table should have a set index, or one should be specified for the index argument.
        
        Parameters
        ----------
        index: String - An optional column to use as index of the returned table: otherwise the current index is used (default = None)
        
        Returns
        -------
        copyTable: Pandas DataFrame
            A copy of the data in which every numerical column (and the timestamp column when timestamp
            anomaly is enabled) holds surprise values, plus a "mean_surprise" column; sorted by
            "mean_surprise" (descending).
        """
        
        #Preprocessing data
        temp = self.kernel_prep(self.data)
        copyTable = copy.copy(self.data)
        
        # Timestamp anomaly
        # NOTE(review): the interval surprise comes back on a 0..n-2 index - confirm it lines up with the rows as intended
        if self.timestamp_anom == True:
            copyTable[self.timestamp] = self.date_time_interval_anomaly(self.data[self.timestamp])

        # Checking if index given, if it isn't will just use Table's current default index
        if isinstance(index, str):
            index = self.data.get(index)
        else:
            index = self.data.index
            

        #Obtaining surprise of every individual column
        # The accumulator shares the data's index so the additions below line up row by row
        sum_surprise  = pd.Series(np.zeros(self.data.shape[0]), index = self.data.index)
        for col in temp.columns:
            stat = temp.get(col)
            copyTable[col] = self.surprise_estimator(stat)
            sum_surprise+=copyTable[col]
        
        # If we have a TimeSeries column, adding it to sum surprise
        if(self.timestamp_anom == True):
            sum_surprise += copyTable[self.timestamp]
            
        # Dropping the index so the values are assigned positionally after the index is replaced
        sum_surprise = sum_surprise.array
        
        copyTable = copyTable.set_index(index)
        # Averaging our surprise so we can sort by it
        copyTable = copyTable.assign(mean_surprise = np.round(sum_surprise/copyTable.shape[1],2))

        # Sorting table for easier visualization
        copyTable = copyTable.sort_values(by = "mean_surprise", ascending  = False)
        return copyTable
    
    
    def train_test_anomaly(self) -> pd.DataFrame:
        """
        Given a preloaded train and test set, calculates metric of surprise based off the difference between Train and Test Kernel densitites
        
        Common Issues: Since this method is comparing by column, the train and test datasets must have the same columns with the same names
        prior to runtime.
        
        Returns
        -------
        surprise: pd.DataFrame
                A DataFrame, indexed like the prepared test set, containing the anomaly measure for every column.
        """
        # Trimming train and test sets
        trainTrimmed = self.kernel_prep(data = self.trainDf)
        testTrimmed = self.kernel_prep(data = self.testDf)

        # Independent copy of the prepared test set that receives the per-column results
        surprise = testTrimmed.copy()
        
        for col in trainTrimmed.columns:
            # Kernel fitted on the training column
            model, _ = self.kernel_decider(trainTrimmed[col])
            
            # The observed densities (kernel fitted on the test column itself).
            _, testKernel = self.kernel_decider(testTrimmed[col])
            
            # The expected values for the test-densities if the test set was identical to the training distribution
            testKernelExpected = model.score_samples(testTrimmed[col].to_numpy().reshape(-1,1))
            difference = pd.Series(data = testKernel-testKernelExpected, index = testTrimmed.index)
            # Undefined / infinite differences are clamped to finite values before the p-value step
            difference = difference.replace({np.nan: 0, np.inf: 100000, -np.inf: -100000})
            # Computing surprise
            surprise[col] = -np.log2(self.return_p_value(difference))
            
        return surprise
            
        


In [203]:
class PCA_Analysis(Anomaly):
    """
    Row-based outlier technique that utilizes dimensionality reduction to understand systematic bias by row.
    The dimensionality reduction is a way of finding multi-dimensional outliers without the destruction of computation speed.
    """
    
    # Class-level defaults; __init__ overwrites all of these except pcaData
    data = None          # full data set used by surprise_on_table
    pcaData = None       # fallback input for obtain_pca_vals
    threshold = None     # proportion of variance to keep when reducing dimensionality
    trainDf  = None      # training set
    testDf = None        # test set
    replace_nan = None   # NaN replacement value, or a boolean to drop columns containing NaN
    
    #Initializing dataset, train, and test sets.
    def __init__(self, dataInit: pd.DataFrame = None, train:pd.DataFrame = None, test:pd.DataFrame = None, s:float = .95, replace_nan_with = False):
        """
        Reduces dimensionality while maintaining s% of variablity, then uses dimension mixtures called PC's to find systematic bias across every 
        column in the row (checks for anomaly by row)
        
        Common issues: The value of s (the variability) should be between 0 and 1 (it is a proportion) and not setting it in
        this range will yield an error. Additionally, the PCA class alone does not have a train and test set evaluation and thus
        does not have train and test set variables, so attempting to set them in the constructor will also yield an error.
        
        Parameters
        ----------
        dataInit: Pandas DataFrame - The Data that will be used for the PCa evaluation
        s: float - The floating point hyperparameter for the percent of variance (information) in the dimensionality reduction that you'd like to maintain (default = .95)
        replace_nan_with: float - A value to use to replace NaN values instead of just dropping columns that include them (default = False)
        """
        self.data = dataInit
        self.threshold = s
        self.trainDf = train
        self.testDf = test
        assert self.threshold > 0 and self.threshold < 1, "Please ensure your threshold is between 0 and 1"
        self.replace_nan = replace_nan_with
        

    # Drops non-numerical and nan values from a table
    def pca_prep(self, data: pd.DataFrame = None)-> pd.DataFrame: 
        """
        Removes non-numerical columns and NaN from the dataset for proccessing purposes
        
        Parameters
        ---------
        data: Pandas DataFrame - A Table to prep for PCA
        
        Returns
        -------
        tabl: pandas DataFrame
            A DataFrame containing the processed data   
        """
        # Finding all numerical components of the table so that pca can function
        if type(data) != pd.DataFrame:
            data = self.data
        
        tabl = data.select_dtypes(include = [np.number])
        
        # Giving user choice to replace missing values instead of dropping them altogether
        if type(self.replace_nan) != bool:
            self.replace_nan = float(self.replace_nan)
            tabl.replace(to_replace = [np.nan], value =[self.replace_nan], inplace=True)
        else:
            tabl = tabl.dropna(1)
        return tabl
    
    
    def obtain_variance_table(self, data: pd.DataFrame = None) -> pd.DataFrame:
        """
        Reports how much of the variability each Principal Component accounts for.
        
        Common issues: a Data Set must be available, either through the argument or through self.data,
        otherwise the assertion below fails.
        
        Parameters
        ----------
        data: Pandas DataFrame - Data to obtain the respective variances of (default = self.data)
        
        Returns
        -------
        infoFrame: Pandas DataFrame
            One row per Principal Component: its label ("PC0", "PC1", ...) in "Column" and its explained
            variance ratio in "Variance_ratio"
        """
        # Fall back to the class data set when no table was handed in
        source = data if type(data) == pd.DataFrame else self.data
        assert type(source) == pd.DataFrame, "Please set a Data Set prior to running this method"
        
        # Numerical, NaN-free columns only, then standardized for PCA
        numeric = self.pca_prep(source)
        component_count = numeric.shape[1]
        standardized = StandardScaler().fit_transform(numeric)

        # One component per column so that the ratios of the full decomposition are reported
        model = PCA(n_components = component_count)
        model.fit_transform(standardized)

        labels = ["PC" + str(position) for position in range(component_count)]
        return pd.DataFrame().assign(Column = labels, Variance_ratio = model.explained_variance_ratio_)
    
    
    def obtain_pca_vals(self, componentNum: int, data = None) -> tuple:
        """
        Fits a PCA with the first n principal components and returns it together with the transformed values.
        
        Common issues: the componentNum argument should not exceed the dimensionality (number of columns) of the table.
        
        Parameters
        ----------
        componentNum: int - The number of Principal Components you would like to keep
        data: Pandas DataFrame or array - A dataset to find the PCA values for (default = self.pcaData)
        
        Returns
        -------
        (pca, pcaFit): tuple
            The fitted PCA object and the data projected onto the kept Principal Components
        """
        # Default value (an identity check: type(data) is never equal to None, so comparing types would never match)
        if data is None:
            data = self.pcaData
            
        # Building new model off n Principal Components
        pca = PCA(n_components = componentNum)
        pcaFit = pca.fit_transform(data)
        return pca, pcaFit
    
    
    # Deciding how many columns need to be used: utilizing threashold of 95% of the explained variance
    def element_decider(self,infoFrame: pd.DataFrame)-> int:
        """
        Based off the variance table evaluation, decides how many Principal Compoenents need to be kept to maintain variance threshold
        
        Common issues: Ensure that the Pandas DataFrame you are passing to this method is the variance table obtained 
        via the obtain_variance_table method. Passing this method a difference table will yield an error.
        
        Parameters
        ----------
        infoFrame: Pandas DataFrame - A table containing each Principal Component and the variance it accounts for 
        
        Returns
        -------
        counter: int
            The count of how many sequential Principal compoenents need to be kept to meet the variance threshold
        """
        numSum = 0
        counter = 0

        # Continuing until we have accounted for 95% of the variance
        for i in infoFrame.get("Variance_ratio"):
            if(numSum < self.threshold):
                numSum += i
                counter+=1
        return counter

    
    # Reducing dimensionality of data into pc's, only storing what is neccessary
    def reduced_data(self,infoFrame: pd.DataFrame, pcaData, index: pd.Series) -> pd.DataFrame:
        """
        Reduces the data to the number of Principal Components required by the class threshold and returns
        the fitted PCA together with the component values.
        
        Parameters
        ----------
        infoFrame: Pandas DataFrame - A table with the variance accounted for by each Principal Component. Obtained via self.obtain_variance_table()
        pcaData: 2 dimensional array-like - The scaled Data to be reduced
        index: Pandas Series - The index to use for the returned table
    

        Returns
        -------
        (pca, pcaFrame): the fitted PCA object and a Pandas DataFrame, indexed by index, containing the
            values of the kept Principal Components.
        """
        # How many components the variance threshold asks for
        components_needed = self.element_decider(infoFrame)
        fitted_pca, component_values = self.obtain_pca_vals(components_needed, pcaData)

        # Re-attach the caller's index to the component values
        return fitted_pca, pd.DataFrame(component_values).set_index(index)
    
    
    #Summing p-values because PCA serves to check for systematic bias, whereas kernel density checks for accuracy
    def sum_rows(self,pcaVals: pd.DataFrame) -> np.array:
        """
        Averages the absolute values of a table by row.

        The columns are looked up by the string labels "0", "1", ... up to the number of columns.
        
        Parameters
        ----------
        pcaVals: Pandas DataFrame - A table whose columns are labelled with consecutive number strings
        
        Returns
        -------
        row_means: np.ndarray
            One value per row: the mean of the absolute values across the columns.
        """
        row_count, column_count = pcaVals.shape
        row_means = np.zeros(row_count)

        # Accumulate |value| column by column, in place
        for label in map(str, range(column_count)):
            row_means += abs(pcaVals.get(label).array)

        # Turn the accumulated sum into an average over the columns
        row_means /= column_count
        return row_means


    # Tests for systematic bias by row
    def pca_row_outliers(self, pcaVals: pd.DataFrame) -> pd.DataFrame:
        """
        Computes the row-wise anomaly (Surprise) of a table of Principal Component values.

        Every component column is turned into p-values with self.return_p_value, the p-values are averaged
        by row with self.sum_rows, and the Surprise is the negative base-2 logarithm of that average.
        
        Common issues: An extremely low P-value may yield a Surprise of infinity: this still does not guarantee anomaly,
        but rather means that the result is exponentially unlikely based off the computation.
        
        Parameters
        ----------
        pcaVals: Pandas DataFrame - Principal Component values, with columns labelled 0, 1, 2, ...
        
        Returns
        -------
        newVals: Pandas DataFrame
            pcaVals with an added "Surprise" column, sorted by it in descending order.
        """
        # p-values per component, stored under string labels as sum_rows expects
        p_values = pd.DataFrame()
        for component in range(pcaVals.shape[1]):
            p_values[str(component)] = self.return_p_value(pcaVals.get(component))

        row_average = self.sum_rows(p_values)

        # Surprise is the negative log of the averaged p-value
        with_surprise = pcaVals.assign(Surprise = -np.log2(row_average))
        return with_surprise.sort_values(by = "Surprise", ascending = False)

    
    # Master method to run PCA as a whole
    def surprise_on_table(self) -> pd.DataFrame:
        """
        Runs the whole PCA pipeline on the loaded dataset (self.data) and scores every row.

        The dataset is passed through self.pca_prep, standardised with a StandardScaler, reduced with
        self.reduced_data and finally scored with self.pca_row_outliers.
        
        Common issues: Data Set must be initialized prior to run. The index of self.data is reused as the
        index of the result. See other function documentations for more common issues.
        
        Returns
        -------
        new_pca: Pandas DataFrame
            A table Containing anomaly projections based on Principal Component Analysis value, sorted in order by surprise
        """
        prepared = self.pca_prep(self.data)
        variances = self.obtain_variance_table(self.data)
        # The prepared table is standardised before it is handed to the reduction step
        standardized = StandardScaler().fit_transform(prepared)
        # Only the component scores are needed here; the fitted model is discarded
        _, component_scores = self.reduced_data(variances, standardized, self.data.index)
        return self.pca_row_outliers(component_scores)
    
    def train_test_anomaly(self) -> pd.DataFrame:
        """
        Utilizes PCA to find difference between multi-dimensional shape of one distribution vs. the other.

        A PCA model is built on the standardised train set and used to project the test set (the expected
        fit). A second model is built on the test set itself (the observed fit). The row-wise difference
        between both projections is scored with self.pca_row_outliers.
        
        Common issues: Train and Test must be initialized prior to run. Only columns present in both prepared
        sets are used. See other function documentations for more common issues.
        
        Returns
        -------
        new_pca: Pandas DataFrame
            A table Containing anomaly projections of the test set based on Principal Component Analysis value, sorted in order by surprise
        """
        # Preprocessing both sets
        train_features = self.pca_prep(self.trainDf)
        test_features = self.pca_prep(self.testDf)

        # Keeping only mutual columns, in the training order: the fitted model works on column positions,
        # so position k has to mean the same feature in both matrices
        shared_cols = [name for name in train_features.columns if name in test_features.columns]
        train_features = train_features[shared_cols]
        test_features = test_features[shared_cols]

        # Building a Training Model so that we can use it to transform test set values.
        # The scaler is kept so the test rows can later be brought into the same standardised space.
        variance_table = self.obtain_variance_table(train_features)
        train_scaler = StandardScaler()
        pca, _ = self.reduced_data(variance_table, train_scaler.fit_transform(train_features), self.trainDf.index)

        # The observed fit onto the test set, which will be compared to the expected fit
        v_table = self.obtain_variance_table(test_features)
        _, values = self.reduced_data(v_table, StandardScaler().fit_transform(test_features), self.testDf.index)

        # Transforming test set values, the expected fit. The model was fitted on standardised data, so the
        # test rows go through the training scaler first instead of being projected as raw values.
        expected = pca.transform(train_scaler.transform(test_features))
        transformed_test = pd.DataFrame(expected).set_index(self.testDf.index)

        # Finding difference between computations and obtaining anomaly; components that exist in only
        # one of the two projections produce NaN and are counted as no difference
        test_diff = transformed_test.subtract(values)
        test_diff = test_diff.replace({np.nan:0})
        new_pca = self.pca_row_outliers(test_diff)
        return new_pca

In [423]:
# Yet to be documented
class Categorical(Anomaly):
    """Uses dynamically built data "grammar conventions" to find outliers based on defiance of strict structures"""
    
    data = None
    trainDf = None
    testDf = None 
    userParams = None
    userFactor = None
    
    # Initializing data, train and test sets, and potential outlier characters/words
    def __init__(self,data_init = None , train_init = None , test_init = None, user_params_init:[] = None, user_surprise_factor:int = None):
        """
         Object constructor which allows user to set dataset and initialize train+test sets. 
        Keep in mind the dataset can also be loaded by means of any method in the base Anomaly class.
        You also don't have to initialize your train and test sets right away and can use our random splitter to do so prior to running the Categorical Anomaly itself!
        User also given the option to initialize values that if found add to the anomaly factor by n automatically. This can be done with user_params_init
        in the constructor.
        
        Parameters
        ----------
        data_init: Pandas DataFrame - The dataset you would like to fit Kernel Density to (default = None)
        train_set: Pandas DataFrame  - The train set you would like to use for Kernel Density purposes (default = None)
        test_set: Pandas DataFrame  - The test set you would like to use for Kernel Density purposes (default = None)
        user_params_init: list - A list of numerical or string values that, if found in the dataset, automatically increase surprise by user_surprise_factor points
        user_surprise_factor: int - The factor to increase a value by if user_params_init has been found in a given data cell
        """
        
        self.data = data_init
        self.trainDf = train_init
        self.testDf = test_init
        userParams = user_params_init
        userFactor = user_surprise_factor
        
     
    # Option to initialize user params during runtime
    def initialize_user_params(self,params: [], factor: np.number):
        """
        Used to initialize parameters that automatically lead to increase in Anomaly at any point in the run
        
        Parameters
        ----------
        params: list - A list of numerical or string values that, if found in the dataset, automatically increase surprise by n = factor points
        factor: int - The factor to increase a value by if a member of params has been found in a data cell.
        """
        self.userParams = params
        self.userFactor = factor
        
    
    # Will examine whether or not a column is categorical, giving the user the opportunity to add additional numeric columns
    # Should it automatically identify all String columns as categorical?
    def identify_categorical(self,surpriseFrame: pd.DataFrame, supress: bool = False)-> list:
        """
        Identifies the Categorical columns in a table. Default = all non-numeric columns, with potential of user input to add other types.
        
        Common issues: Will automatically identify all columns that are not of a numeric dtype as categorical.
        Column names typed in at the prompt are appended as entered and are not checked against the table.
        
        Parameters
        ----------
        surpriseFrame: pd.DataFrame - A table to identify the categorical columns of
        supress: bool - When True the interactive question is skipped completely: nothing is printed and
        no input is read (default = False)
        
        Returns
        -------
        categorical_list: list
            A list of all the Categorical columns in the dataset
        """
        categorical_list = [col for col in surpriseFrame.columns if not is_numeric_dtype(surpriseFrame[col])]

        # Suppression has to be decided before the first input() call, otherwise the run still blocks
        # waiting for an answer. Only a real flag suppresses; any other object keeps the interactive path.
        if isinstance(supress, (bool, np.bool_, int)) and supress:
            return categorical_list

        # Allows fixing of default assumption that numeric columns aren't categorical
        print("Are there any numeric Columns you would consider categorical?(yes/no)")
        while input().upper() == "YES":
            print("Enter the name of one such column:")
            categorical_list.append(input())
            print("Any more?(yes/no)")
        return categorical_list
    
    
    # Returns suprise of type classification
    def types(self,column: pd.Series) -> pd.Series:
        """
        Tests the data type 'grammatical rule', calculating anomaly based off how often certain data types are followed
        
        Common issues: Only looks at 3 categories: boolean, numeric, or string.
        
        Parameters
        ----------
        column: Pandas Series - A column to test for types 
        
        Returns
        -------
        surpriseVal: Pandas Series
            A measure of anomaly for the types test of the column using the metric of surprise, ordered by index
        """
        value_types = column.apply(self.classifier)
        counts  = value_types.value_counts(normalize = True)
        index = counts.index
        values = counts.values
        probs = value_types.apply(self.give_prob, args = (np.array(index), np.array(values)))
        surpriseVal = probs.apply(self.surprise)
        return surpriseVal

    
    # Obtains the type of value, even if it is currently contained within a string
    def classifier(self, value) -> str:
        """
        Identifies the data type of a certain value (even if stored in a string)
        
        Common issues: Only identifies 3 data types: number, string, boolean. Non-finite spellings such as
        "nan" or "inf" are reported as strings.
        
        Parameters
        ----------
        value: A primitive python value
        
        Returns
        -------
        ret: String
            'boolean', 'number' or 'string', describing the data type of the input value
        """
        text = str(value).strip()
        # The whole cell has to spell a boolean; a mere substring ("Truett") is ordinary text
        if text.casefold() in ("true", "false"):
            return 'boolean'
        # Digit-only text (including unicode numerals) is a number
        if text.isnumeric():
            return 'number'
        # Signed, decimal and exponent notation is numeric as well, e.g. "64.0" or "-5"
        try:
            parsed = float(text)
        except ValueError:
            return 'string'
        return 'number' if np.isfinite(parsed) else 'string'

            
    # Takes in a column and returns the surprise of each nan value being present (True) or not being present (False)
    def nans(self, column: pd.Series) -> pd.Series:
        """
        Tests data for missing (NaN) values, quantifying anomaly based off the regularity of missing values.
        
        Parameters
        ----------
        column: Pandas Series - A column to test for nan values
        
        Returns
        -------
        surpriseVal: Pandas Series
            A Series representing anomaly estimations for missing and nonmissing values based off the regularity of nans
        """
        nan_values = column.apply(self.is_nan)
        counts  = nan_values.value_counts(normalize = True)
        index = counts.index
        values = counts.values
        probs = nan_values.apply(self.give_prob, args = (np.array(index), np.array(values)))
        surpriseVal = probs.apply(self.surprise)
        return surpriseVal

    
    # Takes in a column and returns the surprise of the length of each value in the column: the first and simplest of probabilistic tests
    def len_count(self, column: pd.Series)-> pd.Series:
        """
        Tests for anomaly by sequence length (A standard sequence length being violated could be anomalous)
        
        Parameters
        ----------
        column: Pandas Series - A column to test for length
        
        Returns
        -------
        surpriseVal: Pandas Series
            A Series representing anomaly estimations for respective values based off standards for sequence length
        """
        column = column.apply(str)
        counts = column.apply(len).value_counts(normalize = True)
        index = counts.index
        values = counts.values
        column = column.apply(len)
        probs = column.apply(self.give_prob, args = (np.array(index), np.array(values)))
        surpriseVal = probs.apply(self.surprise)
        return surpriseVal

    
    # Calculates the surprise of a given value
    def surprise(self, value):
        """
        Converts a probability into the surprise metric: the negative base-2 logarithm of the value.
        
        Common Issues: -log2(0) does not exist, so will return infinite values for p-values that are negligibly close to 0
        
        Parameters
        ----------
        value: float - A floating point p-value
        
        Returns
        -------
        surprise: float
            A number representing the entropy/anomaly of the data member
        """
        log_probability = np.log2(value)
        return -log_probability

    
    # Given a numerical value, finds it equivalent within the set of indices and assigns it the proper probability
    def give_prob(self, value, index, values):
        """
        Looks a value up among the labels in index and returns the probability stored at the same position of values.
        
        Common issues: value must correspond to a value in the index. When no label matches, the first
        probability (values[0]) is returned instead.
        
        Parameters
        ----------
        value: primitive - A value corresponding to some element in the index
        index: [] - Respective value labels
        values:[] - A list of corresponding probabilities to the index list
        
        
        Returns
        -------
        ret: float
            The respective probability of the value inputted
        """
        for position, label in enumerate(index):
            if value == label:
                return values[position]
        # No label matched: fall back to the first listed probability
        return values[0]


    # NaN's aren't equal to themselves
    def is_nan(self, x) -> bool:
        """
        Tests if a single value is a missing (Null/NaN) value.

        Covers float NaN as well as None, pandas NA and NaT, which a plain "is it unequal to itself"
        comparison does not: None compares equal to itself and pandas NA does not compare to a boolean.
        
        Parameters
        ----------
        x: primitive value - A value to test for being a null/NaN value
        
        Returns
        -------
        ret: bool
            A Boolean representing whether the value is a null/NaN value. Containers are never reported as missing.
        """
        # pd.isna on a container answers element by element, so only scalars are passed to it
        if not pd.api.types.is_scalar(x):
            return False
        return bool(pd.isna(x))
 

    # Checks for special characters within a string, calculating surprise so as to identify which character combinations are chaotic
    def special_char(self, column: pd.Series) -> pd.Series:
        """
        Tests for anomaly by special character sequence and presence
        
        Parameters
        ----------
        column: Pandas Series - A column to test for special characters
        
        Returns
        -------
        surpriseVal: Pandas Series
            A Series representing anomaly estimations for respective values based off standards for special character sequence/ presence
        """
        characters = column.apply(str).apply(self.char_identifier)
        counts  = characters.value_counts(normalize = True, dropna = False)
        index = counts.index
        values = counts.values
        probs = characters.apply(self.give_prob, args = (np.array(index), np.array(values)))
        surpriseVal = probs.apply(self.surprise)
        return surpriseVal

    
    # Checks if a single entry of any data type contains special symbols and returns all that it contains
    def char_identifier(self, entry) -> str:
        """
        Lists which special characters occur in a given entry.

        Each character of the fixed list below is reported at most once, in the order of that list
        (not in the order in which it appears in the entry).
        
        Parameters
        ----------
        entry: String - A String to test for special characters
        
        Returns
        -------
        ret_string: String
            The special characters found in the entry, or an empty string when there are none.
        """
        special_symbols = ('<', '>', '!', '#','_','@','$','&','*','^', ' ', '/', '-','"','(', ',', ')', '?', '.')
        found = [symbol for symbol in special_symbols if symbol in entry]
        return "".join(found)

    
    # Simpler approach here: if the value counts of certain elements are greater when they should be unique, they are more suprising
    # If they are non-unique when they are supposed to be unique, also more surprising. Done with binary classification
    def uniques(self, column: pd.Series) -> pd.Series:
        """
        Tests for anomaly by uniqueness of value (A value being unique in a series full of nonunique could be anomalous)
        
        Parameters
        ----------
        column: Pandas Series - A column to test for uniqueness
        
        Returns
        -------
        surpriseVal: Pandas Series
            A Series representing anomaly estimations for respective values based off standards for uniqueness
        """
        # Counting number of each value and returning whether or not it is a singular unique value,
        #then counting truly unique values
        vals = column.value_counts(dropna = False).apply(self.isunique)
        vals = column.apply(self.unique_assignment, args = [vals])
        counts = vals.value_counts(normalize = True, dropna = False)
        index = counts.index
        values = counts.values
        probs = vals.apply(self.give_prob, args = (np.array(index), np.array(values)))
        surpriseVal = probs.apply(self.surprise)
        # Note: if all values unique/non unique this will provide definite outcome because no room for uncertainty
        return surpriseVal

    
    # Returns whether the count of a value is 1
    def isunique(self, val) -> bool:
        """
        Reports whether a value count stands for a unique value, i.e. whether the count equals 1.
        
        Parameters
        ----------
        val: A value count - A value count to be tested
        
        Returns
        -------
        ret: Boolean
            A True/False value corresponding to uniqueness of the value whose count was given
        """
        occurs_once = (val == 1)
        return occurs_once

    
    # Maintains individual values without grouping while assigning unique / nonunique probabilities. To be used on original column
    def unique_assignment(self, val, column):
        """
        Looks up the unique / non-unique flag that belongs to a single data value.
        
        Parameters
        ----------
        val: primitive - A value taken from the original data column
        column: Series - Flags indexed by data value (True = the value occurs once), as built in uniques
        from the value counts of the original column.
        
        Returns
        -------
        value: bool
            The flag stored for val. An IndexError is raised when val is not among the labels of column.
        """
        labels = column.index
        # Missing values never compare equal to a label, so they are matched through a null mask.
        # This covers None and pandas NA as well as float NaN, on both sides of the lookup.
        if pd.api.types.is_scalar(val) and pd.isna(val):
            matches = np.asarray(pd.isna(labels))
        else:
            matches = np.asarray(labels == val)
        value = column[matches]
        return value.iloc[0]
  

    # Master method: replaces every categorical column of the loaded dataset with its surprise score
    def surprise_on_table(self, supress = False) -> pd.DataFrame:
        """
        Builds grammatical expectations for the loaded dataset (self.data), calculating Surprise based off the commonality with which such tendencies are broken

        For every categorical column the five tests (uniques, special_char, nans, types, len_count) are
        averaged, converted with self.return_p_value, turned into surprise and raised by the user factor.
        The loaded dataset itself is left untouched; the scores are written to a copy.
        
        Common Issues: This is a Categorical method and does not take into account numerical magnitude or datetime interval.
        The dataset must be loaded prior to run.
        
        Parameters
        ----------
        supress: bool - Forwarded to identify_categorical to skip the interactive column question (default = False)
        
        Returns
        -------
        table: Pandas DataFrame
            A copy of the dataset in which each value of a categorical column is replaced by its respective
            measure of anomaly. Values that could not be computed are set to 0.
        """
        # Working on a copy: assigning the scores below would otherwise overwrite the user's dataset
        table = self.data.copy()
        cols = self.identify_categorical(table, supress)
        for col in cols:
            # Obtaining individual relative entropies, averaging them out, finding their p-values and calculating final surprise
            values = table.get(col)
            factors = values.apply(self.apply_user_factor)
            temp = (self.uniques(values)+ self.special_char(values)+ self.nans(values) + self.types(values)+ self.len_count(values))/5
            
            # Finding Surprise and adding user defined factors
            table[col] = -np.log2(self.return_p_value(temp)) + factors
        
        # Replacing any uncomputable values with 0
        table = table.replace({np.nan:0})    
        return table

    
    def apply_user_factor(self, value) -> int:
        """
        Used to initialize parameters that automatically lead to increase in Anomaly at any point in the run
        
        Parameters
        ----------
        value: Python Primitive value - A value to search userParams for, applying the userFactor if found
        
        Returns
        -------
        Factor: int
            0 if the userFactor has not been applied, and self.userFactor if it has (based off whether value in userParams)
        """
        # Ensuring a user factor exists
        if self.userParams != None and value in self.userParams:
            return self.userFactor
        else: 
            return 0
    
    
    # Proxy train test anomaly: checks test set anomaly as a function of their combined distribution
    def train_test_anomaly(self):
        """
        A proxy train test anomaly function that evaluates test set categorical anomaly as a subset of the combined train and test
        set rules. For more information, see Categorical.surprise_on_table() method.
        
        Common issues: Train and Test Sets must be initialized prior to run of this method and must contain some categorical data
        used to evaluate. For this purpose, any non-numerical data is by default rendered Categorical Data. Other columns could be
        rendered Categorical by the user.
        
        Returns
        -------
        surprise: Pandas DataFrame
            Evaluated Anomaly metrics for test set as a function of surprise (entropy).
        """
        # Ensuring train and test sets have been initialized
        assert isinstance(self.trainDf, pd.DataFrame) and isinstance(self.testDf, pd.DataFrame), "Train and test sets must be initialized prior to run"
        
        # Grouping train and test sets to form a distribution; the test rows come last
        total_set  = pd.concat([self.trainDf, self.testDf], ignore_index = False)
        
        # Calculating entropy of joint distribution. surprise_on_table always scores self.data and its only
        # argument is the supress flag, so the combined set is loaded temporarily and the user's own
        # dataset is restored afterwards, even if scoring fails.
        loaded_data = self.data
        self.data = total_set
        try:
            surprise = self.surprise_on_table()
        finally:
            self.data = loaded_data
        
        # Returning only test-set Surprise metric values. They are selected by position (everything after
        # the train rows): the original index labels can repeat between both sets and are not positions.
        return surprise.iloc[len(self.trainDf):]


In [None]:
cat = Categorical()
cat.

In [439]:
# Load the sample workbook and attach it to the detector as its dataset.
cat.data = pd.read_excel("sampleDataSet.xlsx")


In [440]:
# assign_train_test is not defined in this part of the file (presumably inherited from Anomaly - confirm).
# The cell output below shows the tuple of frames it returned.
cat.assign_train_test()

(                    Email  Profession       Hometown  Height       Salary  \
 148         Rob@gmail.com          DJ  San Diego, CA    64.0      45000.0   
 70          Rob@gmail.com      Doctor  San Diego, CA    64.0      45000.0   
 118         Gil@gmail.com  Programmer  San Diego, CA    76.0      55000.0   
 137     Herbert@gmail.com       Nurse   San Digo. CA    61.0      50000.0   
 85          Rob@gmail.com          DJ  San Diego, CA    73.0      60000.0   
 40          Gil@gmail.com      Doctor  San Diego, CA    52.0      60000.0   
 94         Ross@gmail.com  Programmer  San Diego, CA    72.0      55000.0   
 13         Ross@gmail.com      Doctor  San Diego, CA    52.0      45000.0   
 79         Ross@gmail.com          DJ  San Diego, CA    77.0      60000.0   
 77       Robert@gmail.com       Nurse  San Diego, CA    75.0      50000.0   
 116     Herbert@gmail.com      Doctor  San Diego, CA    74.0      45000.0   
 66      Herbert@gmail.com      Doctor  San Diego, CA    74.0   

In [None]:
# Yet to be documented
class MultiDimCategorical(Categorical):
    """
    Utilizes the idea of mutual entropy to build first order and 2nd order approximations for a 
    given column based on randomly chosen/handpicked context. Only runs a Train/Test Evaluation: not in Initial Report.
    """
    data = None
    train = None
    test = None 
    
    # Initializing data, train and test sets, and potential outlier characters/words
    def __init__(self,data_init  = None , train_init = None , test_init = None, ):
        """
         Object constructor which allows user to set dataset and initialize train+test sets. 
        Keep in mind the dataset can also be loaded by means of any method in the base Anomaly class.
        You also don't have to initialize your train and test sets right away and can use our random splitter to do so prior to running the Categorical Anomaly itself!
        
        Parameters
        ----------
        data_init: Pandas DataFrame - The dataset you would like to fit Kernel Density to
        train_set: Pandas DataFrame  - The train set you would like to use for Kernel Density purposes (default = None)
        test_set: Pandas DataFrame  - The test set you would like to use for Kernel Density purposes (default = None)
        """
        
        self.data = data_init
        self.train = train_init
        self.test = test_init

        
    # Finds the most chaotic combinations (meaning we actually have distribution on our hands) and returns them as array
    # To be used for naive Bayes model
    def obtain_random_functional_cols(frame: pd.DataFrame, test_column: pd.Series) -> (str, str): 
        """
        Returns 2 potential columns that can be used for building a counts distribution. In the Process, eliminates all columns
        that are completely unique, equal to the current test column, or are completely non-unique so that a counts distribution can be built.
        
        Common Issues: Does not eliminate issue of heavy correlation between the columns as some columns may be categorical,
        so the later application of naive bayes might be too naive (new information is weighted too strongly)
        
        Parameters
        ----------
        frame: Pandas DataFrame - A table from which to obtain the columns
        test_column: Pandas Series - The test column which is used for the mutual counts distribution, passed so that it will not be selected.
        
        Returns
        -------
        cols[point], cols[point2]: 2 element tuple of strings
                2 fitting column names to be used to build our counts distribution
        """
        cols = []
        for x in frame.columns:  
            # Number of unique values, typecasted as an integer
            val = len(frame.get(x).value_counts())
            # Taking out edge cases
            if(val != 1 and val < len(x) and x != test_column):
                cols.append(x)
        point = np.random.randint(0, len(cols))
        point2 = np.random.randint(0, len(cols))
        while(point == point2):
            point2 = np.random.randint(0, len(cols))   
        return cols[point], cols[point2]

    
    # Categorical only DataFrame to be used later for proccessing purposes
    def returnCategoricalFrame(self,frame: pd.DataFrame) -> pd.DataFrame:
        """
        Returns only the Categorical Columns in a Pandas DataFrame
        
        Common Issues: Mixed-type columns with String inputs are automatically identified as Categorical, and no Numeric 
        Inputs found to be Categorical unless specified so during runtime (identify_categorical asks the user).
        
        Parameters
        ----------
        frame: Pandas DataFrame - A table to search for categorical column
        
        Returns
        -------
        cols: Pandas DataFrame
            A table consisting of categorical columns only. The index is reset, so the previous index
            becomes the first column of the result.
        """
        # The column detection lives on the parent class as identify_categorical
        cols = self.identify_categorical(frame)
        return (frame[cols]).reset_index()

    
    # Changing numerical columns to behave as categorical columns for purpose of predicting categorical columns
    def modify_numerics(self, frame: pd.DataFrame, categorical_columns: list, test_column: pd.Series) -> pd.DataFrame:
        """
        Transforms numerical values into categorical numbins for counts purposes (numerical values often completely unique)

        Every value is replaced by the ten-point bin of its percentile rank within its column: the rank
        percentile (ties share their average rank, as scipy's percentileofscore with kind='rank' does)
        divided by 10 and truncated, which gives bins 0 to 10. Missing values go to bin 0.
        
        Common Issues: Causes a number to lose several potentially useful numerical properties, but needed for counts purpose.
        
        Parameters
        ----------
        frame: Pandas DataFrame - A table whose numerical columns will be binned. It is not modified.
        categorical_columns: list - Column labels to ignore throughout numerical proccessing.
        test_column: column label or Pandas Series - The tested column, which will also be ignored during numerical
        proccessing. A Series is identified by its name.
        
        Returns
        -------
        frame: Pandas DataFrame
            A copy of the table whose numerical values have been placed in numbins (numerical -> categorical)
        """
        excluded = test_column.name if isinstance(test_column, pd.Series) else test_column
        # Binning a copy keeps the caller's table intact
        frame = frame.copy()
        for col in frame.select_dtypes(include = [np.number]).columns:
            if col == excluded or col in categorical_columns:
                continue
            # Percentile rank of every value, computed for the whole column at once.
            # rank * 100 / n is exact whenever the percentile is a whole number, so bin edges are stable.
            ranks = frame[col].rank(method = "average")
            percentiles = ranks * 100.0 / frame[col].count()
            frame[col] = (percentiles // 10).fillna(0).astype(int)
        return frame

    
    # Converts individual values to percentiles
    def convert_to_percentile(self, val, values) -> float:
        """
        Finds the Percentile a Value belongs in 
        
        Parameters
        ----------
        val: number - A value to find the percentile of 
        values: list of numbers - A list to use to check the percentile of.
        
        Returns
        -------
        percentile: float
            The Percentile (0 to 100) of val as a member of the values list
        """
        # scipy.stats is imported by this file under the alias st
        return st.percentileofscore(values, val)


    # Places percentile values in ten-point bins (0-9.99 -> 0, 10-19.99 -> 1, ..., 100 -> 10)
    def place_in_numbin(self, sval: float) -> int:
        """
        Places a value in a numbin (percentile bins increasing by increments of 10)
        
        Common Issues: Causes a number to lose several potentially useful numerical properties, but needed for counts purpose.
        
        Parameters
        ----------
        sval: float - A percentile value to place in a numbin
        
        Returns
        -------
        numbin: int
            The categorical numbin in which the value is placed: sval divided by 10 and truncated, or 0 for a missing value
        """
        # Missing values cannot be divided and are collected in bin 0
        if pd.isna(sval):
            return 0
        return int(sval/10)

    # A simple counts-based probability distribution
    def zero_order_approx(self, table: pd.DataFrame, column: pd.Series) -> pd.Series:
        """
        Builds a probability distribution for the counts of a given column, to be used in the first order approximation
        
        Parameters
        ----------
        table: Pandas DataFrame - The Table from which to take the desired column for the counts distribution
        column: column label or Pandas Series - The column that is used to build the zero order counts distribution
        
        Returns
        -------
        otherCol: Pandas Series
                A Series representative of the count distribution (by proportion) of the data, indexed by
                the distinct values of the column and named after the first column of the table
        """
        # Missing values are turned into 0 so that they are counted like any other value
        table = table.replace({np.nan:0})
        totalRows = table.shape[0]
        # size() counts the rows of every group directly. Reading the count from the table's first column
        # fails when that column is the one being grouped, because it moves into the index.
        group_sizes = table.groupby(column).size()
        return group_sizes.rename(table.columns[0]) / totalRows

    
    # Yields a Bayesian distribution using the context provided by another column, builds upon zeroOrder in complexity
    def first_order_approx(self, data: pd.DataFrame, column: pd.Series, other: pd.Series) -> pd.DataFrame:
        """
        Builds a first order approximation based off the mutual counts distribution built by grouping by both columns

        For every observed pair (value of column, value of other) the result holds P(A|B): the number of rows
        with that pair divided by the number of rows with that value of other. This equals the Bayes
        expression P(B|A) * P(A) / P(B), and the probabilities of each context value add up to 1.
        
        Common Issues: This may be quite expensive, and if data extremely large could potentially take quite a long time.
        In very extreme cases, could cause Kernel to crash.
        
        Parameters
        ----------
        data: Pandas DataFrame - The table from which the columns come from
        column: column label or Pandas Series - The test column whose mutual distribution we are attempting to discern (in Bayesian thinking, our distribution A in P(A|B))
        other: column label or Pandas Series - The column used to give 'context' for the test column (in Bayesian thinking, our distribution B in P(A|B))
        A Series is identified by its name in both cases.
        
        Returns
        -------
        finalProb: Pandas DataFrame
            A table indexed by data_value with the columns given_knowledge and probability, one row per observed pair.
        """
        # Ensuring that NaN values do not mess up value counts
        table= data.replace({np.nan:0})
        target = column.name if isinstance(column, pd.Series) else column
        context = other.name if isinstance(other, pd.Series) else other

        # Rows per (A, B) pair and rows per B value; size() does not depend on any third column
        pair_counts = table.groupby([target, context]).size()
        context_counts = table.groupby(context).size()

        data_values = pair_counts.index.get_level_values(0)
        knowledge = pair_counts.index.get_level_values(1)

        # P(A|B) = count(A, B) / count(B), with the context totals lined up with the pairs
        probability = pair_counts.to_numpy() / context_counts.reindex(knowledge).to_numpy()

        finalProb = pd.DataFrame({"data_value": data_values, "given_knowledge": knowledge, "probability": probability})
        return finalProb.set_index("data_value")


    # Builds a model to obtain context and create a first order surprise approximation for a given column
    def surprise_approx(self,column: pd.Series) -> pd.DataFrame:
        """
        Given a Train and Test Set and a column to test, finds metric of anomaly (surprise) between train and test set distributions given first order context
        
        Common Issues: Train and Test Set (self.train / self.test) must have exact same column names for this purpose,
        otherwise potential of break. Functions on one column at a time. The context column is drawn at random,
        so repeated runs can differ.
        
        Parameters
        ----------
        column: column label - The column to test via first order approx
        
        Returns
        -------
        test: Pandas DataFrame
            A table modeling difference between train and test sets and approximating metric of surprise,
            sorted from most to least surprising.
        """
        # Placing numerical values in bins. Copies are binned and kept in local variables, so the stored
        # train and test sets keep their original values and can be scored again for another column.
        binned_train = self.modify_numerics(self.train.copy(), self.returnCategoricalFrame(self.train).columns, column)
        binned_test = self.modify_numerics(self.test.copy(), self.returnCategoricalFrame(self.test).columns, column)

        # Finding columns that are potential candidates for knowledge base based off the fact that not everything is a unique value.
        # The helper takes no instance argument, so it is reached through the class.
        trainCandidate, _ = type(self).obtain_random_functional_cols(binned_train, column)
        train_probs = self.first_order_approx(binned_train, column, trainCandidate).reset_index()

        # Building a new index which can be used to measure difference between train and test set
        train_probs = train_probs.assign(indx = train_probs.get("data_value").apply(str) + " " + train_probs.get("given_knowledge").apply(str))
        train_probs = train_probs.set_index("indx")

        # The test set is described with the same context column so that both tables are comparable
        test_probs = self.first_order_approx(binned_test, column, trainCandidate).reset_index()
        test_probs = test_probs.assign(indx = test_probs.get("data_value").apply(str) + " " + test_probs.get("given_knowledge").apply(str))
        test_probs = test_probs.set_index("indx")

        # Finding the difference and calculating surprise; pairs unseen in training count as probability 0
        test = test_probs.assign(diff = test_probs.get("probability").subtract(train_probs.get("probability"), level = "indx", fill_value = 0))
        # NOTE(review): return_p_value is not defined in this part of the file; it is the p-value helper
        # the other detectors call on self - confirm it is inherited from Anomaly.
        test = test.assign(surprise = self.surprise(self.return_p_value(test.get("diff"))))
        return test.sort_values(by = "surprise", ascending = False)

In [None]:
class Report(Anomaly):
    """
    An extension of the Anomaly base class that offers a more verbose and visual report for an initial anomaly scan.

    Typical use: construct with a dataset (columns are grouped automatically unless autoInit is False),
    optionally override the grouping with split_columns, then call metadata, run_intial_tests or basic_visuals.
    """
    # Dataset the report is built from
    data = None
    # Dictionary with keys "Numeric", "Categorical" and "Timestamp", each mapping to a list of column names
    colDist = None
    # KernelPCA object created by run_intial_tests
    knalPca = None
    # Categorical object created by run_intial_tests
    categorical = None
    # Tuple produced by the most recent run_intial_tests call
    report  = None


    def __init__(self, dataInit: pd.DataFrame = None, autoInit: bool = True):
        """
        Object constructor which allows user to set dataset and decide whether grouping columns into Numerical,
        Categorical, and Timestamp values will happen automatically. Keep in mind the dataset can also be loaded
        by means of methods in the base Anomaly class.

        Parameters
        ----------
        dataInit: Pandas DataFrame - The dataset you would like to use for the purposes of the Report (default = None: no dataset yet)
        autoInit: Boolean  - A True/False value deciding whether split into Categorical/Numeric/Datetime columns will be automated (default = True: automated)
        """
        self.data = dataInit
        # The automatic split inspects the dataset's dtypes, so it can only run once a dataset is present.
        # Without this guard, Report() with no arguments fails on None.replace(...)
        if autoInit and dataInit is not None:
            self._split_columns()


    # Default splitter for columns if user does not want to initialize
    def _split_columns(self):
        """A default automatic column splitter that splits into Datetime, Numerical, and Categorical Columns based on dtype"""
        # NaN placeholders are swapped for 0 on a temporary copy before the dtypes are inspected; self.data is left untouched
        temp = self.data.replace({np.nan: 0, "NaN": 0})
        temporalTypes = [np.datetime64, np.timedelta64]

        # Numeric dtypes -> Numeric, datetime/timedelta dtypes -> Timestamp, everything else -> Categorical
        numCols = temp.select_dtypes(include = [np.number])
        dateTime = temp.select_dtypes(include = temporalTypes)
        catCols = temp.select_dtypes(exclude = [np.number] + temporalTypes)

        # Turning kept columns into a dictionary
        self.colDist = {"Numeric": numCols.columns.to_list(),
                        "Categorical": catCols.columns.to_list(),
                        "Timestamp": dateTime.columns.to_list()}


    # Manual user column splitter
    def split_columns(self, numericCols: list = None, categoricalCols: list = None, dateTimeCols: list = None) -> dict:
        """
        A user initializer for the Numerical, Categorical, and Timestamp columns: in case user doesn't want them initialized by default.

        Parameters
        ----------
        numericCols: list - A list of the numeric columns in the dataset. Categorical columns with numeric representations can be excluded. (default = None: no columns)
        categoricalCols: list - A list of the Categorical Columns in the Dataset (default = None: no columns)
        dateTimeCols: list - A list of timestamp/datetime columns in the dataset (default = None: no columns)

        Returns
        -------
        colDist: Dictionary
            A dictionary representing the distribution of columns into numeric, categorical, and datetime
        """
        # None stands in for "no columns": a literal [] default would be one list object shared by
        # every call (and every Report instance), since the lists are stored on the object as-is
        self.colDist = {"Numeric": [] if numericCols is None else numericCols,
                        "Categorical": [] if categoricalCols is None else categoricalCols,
                        "Timestamp": [] if dateTimeCols is None else dateTimeCols}
        return self.colDist


    def _describe_columns(self, columns) -> pd.DataFrame:
        """Describes the given columns of the dataset, returning an empty DataFrame when there are no columns to describe"""
        # DataFrame.describe raises ValueError on a frame without columns, which would otherwise
        # make metadata() unusable on datasets lacking either numeric or categorical columns
        if len(columns) == 0:
            return pd.DataFrame()
        return self.data[columns].describe()


    def metadata(self) -> (pd.DataFrame, pd.DataFrame):
        """
        Returns a set of tables describing Numeric and Categorical metadata (basic metrics)

        Common Issues: Columns must already be split (automatically or through split_columns) before calling this.

        Returns
        -------
        numDescribe, categoricalDescribe: tuple(Pandas DataFrame, Pandas DataFrame)
            A tuple consisting of Numeric and Categorical metadata tables. A table is empty when the dataset
            has no columns of that type.
        """
        # Describing Numerical values
        numDescribe = self._describe_columns(self.colDist["Numeric"])

        # Describing Categorical values
        categoricalDescribe = self._describe_columns(self.colDist["Categorical"])
        return numDescribe, categoricalDescribe


    def run_intial_tests(self, index: str) -> (pd.DataFrame, pd.DataFrame, [pd.DataFrame]):
        """
        Runs Categorical Distribution test on Categorical Column, Kernel-PCA on numeric values, and intervals on Datetime values

        Common Issues: DataFrame should preferably have a set index, which should be passed to this method.
        This will avoid value loss at runtime.

        Parameters
        ----------
        index: String - A String to use as an index for the Numerical Anomaly table

        Returns
        -------
        initial_report: tuple(Pandas DataFrame, pandas DataFrame, [Pandas DataFrame])
            A DataFrame for the Categorical anomaly approximation, another one for Kernel-PCA anomaly approximation (for Numerical values)
            and a list of DataFrames for every datetime interval anomaly approximation (tends to consist of just one element).
            The same tuple is also stored in self.report.
        """
        # Obtaining all Numeric values
        numColFrame = self.data[self.colDist["Numeric"]]

        # Obtaining all Categorical values
        categoricalFrame = self.data[self.colDist["Categorical"]]

        # Instantiating Categorical and Kernel-PCA objects
        self.categorical = Categorical(categoricalFrame)
        self.knalPca = KernelPCA(numColFrame)

        # Running individual surprise approximation on Numerical and Categorical Columns
        categoricalFrame = self.categorical.surprise_on_table(self.categorical.data, True)
        numColFrame = self.knalPca.run_on_current_data(index)

        # Running interval approximations on every Timestamp column, collected in a list to be returned separately
        interval_list = [self.knalPca.date_time_interval_anomaly(self.data[col])
                         for col in self.colDist["Timestamp"]]

        self.report = categoricalFrame, numColFrame, interval_list
        return self.report


    def basic_visuals(self, columnType: str = "Numeric", plotType: str  = "hist"):
        """
        Plots columns of columnType as a plot of plotType

        Common Issues: Most Pandas plot do not handle Categorical Data very well. plotTypes are also limited to the following
        list:
        {'line' : line plot, 'bar' : vertical bar plot, 'barh' : horizontal bar plot, 'hist' : histogram,
        'box' : boxplot, 'kde' : Kernel Density Estimation plot, 'density' : same as 'kde', 'area' : area plot, 'pie' : pie plot}

        Parameters
        ----------
        columnType: String - The column type to plot. Options = ("Numeric", "Categorical", "Timestamp") (default = "Numeric")
        plotType: String - The type of plot to use. Options = ("line", "hist", "barh", "bar", "pie"...(see more under common Issues)) (default = "hist")
        """
        for col in self.colDist[columnType]:
            print("Column: " + str(col))
            try:
                self.data[col].plot(kind = plotType, legend = True)

            # If user attempts to numerically plot categorical data
            except TypeError:
                print("Error: Certain plots require numeric data")

In [None]:
# Idea: add label learning for anomaly detection via clustering: have the user label a couple of instances, then proceed to calculate anomaly from those labels?