In [16]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import umap
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix as cmatrix
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KernelDensity

  import pandas.util.testing as tm


In [17]:
# Let's take an object-oriented approach to building a
# classifier based on Bayesian inference

# a bayesian classifier must: 
# 1) take in as input training and test data,
# 2) calculate "priors" of the classes based on training data,
# 3) decide on a way to calculate likelihoods (KDE density or Gaussian mixture)
# 4) finally, do posterior odds ratio prediction (or probabilities) by combining
# priors with likelihoods 


In [3]:
#----------------------------------------the main Bayesian classifier class------------------------------

class BayesClassifier:
    """Classifier that combines class priors with per-class density models.

    Workflow: ``preprocess()`` splits the data, extracts features and computes
    the log-priors; ``use_model()`` fits one likelihood model per class;
    ``predict()`` adds log-priors to log-likelihoods and returns the most
    probable class for every held-out sample.
    """

    def __init__(self, X, y):
        self.X = X
        self.y = y
        self.categories = np.unique(y)
        self.log_priors = None
        self.log_likelihoods = None
        self.X_features = None
        self.model_dict = None
        # Populated by preprocess().
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self._extractor = None
        self._train_categories = None

    def preprocess(self, n_features):
        """Split the data, extract ``n_features`` features and compute log-priors.

        The priors are the relative class frequencies of the training labels,
        stored as logarithms in sorted class order.
        """
        Fext = FeatureExtractor(self.X, self.y)
        # Called as two separate statements so this works whether or not
        # train_test_split() returns the extractor.
        Fext.train_test_split()
        Fext.get_features(num_features=n_features)
        self.X_features = Fext.features_extracted
        self.y_train, self.y_test, self.X_test = Fext.y_train, Fext.y_test, Fext.X_test
        self._extractor = Fext
        # np.unique returns the classes sorted, with their counts in the same order.
        self._train_categories, counts = np.unique(self.y_train, return_counts=True)
        self.log_priors = np.log(counts / counts.sum())

    def use_model(self, model_type):
        """Fit one likelihood model per class on the extracted training features.

        Parameters
        ----------
        model_type : type
            Either ``sklearn.neighbors.KernelDensity`` or
            ``sklearn.mixture.GaussianMixture`` (the class itself, not an instance).

        Raises
        ------
        RuntimeError
            If ``preprocess()`` has not been called yet.
        ValueError
            If ``model_type`` is not one of the two supported classes.
        """
        if self.X_features is None:
            raise RuntimeError('Call preprocess() before use_model().')
        supported = (sklearn.neighbors.KernelDensity, sklearn.mixture.GaussianMixture)
        if model_type not in supported:
            raise ValueError('model_type must be KernelDensity or GaussianMixture, '
                             'got {!r}'.format(model_type))
        Model = LikelihoodModel(self.X_features, self.y_train, model_type)
        Model.fit()
        self.model_dict = Model.model_dict

    def predict(self):
        """Return the predicted class of every sample in the held-out test set.

        The log-posterior (up to a constant) is log-prior + log-likelihood; the
        class with the largest value wins. The per-class log-likelihood matrix
        is kept in ``self.log_likelihoods`` (one row per sample, one column per
        class).

        Raises
        ------
        RuntimeError
            If ``preprocess()`` / ``use_model()`` have not been run.
        NotImplementedError
            If the feature extractor has no ``transform(X)`` method, in which
            case test samples cannot be mapped into the training feature space.
        ValueError
            If the number of fitted models differs from the number of classes.
        """
        if self._extractor is None or self.model_dict is None:
            raise RuntimeError('Call preprocess() and use_model() before predict().')
        project = getattr(self._extractor, 'transform', None)
        if project is None:
            raise NotImplementedError(
                'The feature extractor does not expose transform(X); test data '
                'cannot be mapped into the feature space the models were fitted on.')
        # model_dict keeps insertion order: one model per training class,
        # in the same sorted order as the priors.
        models = list(self.model_dict.values())
        if len(models) != len(self.log_priors):
            raise ValueError('Expected {} class models, found {}.'.format(
                len(self.log_priors), len(models)))
        test_features = project(self.X_test)
        self.log_likelihoods = np.column_stack(
            [model.score_samples(test_features) for model in models])
        log_posteriors = self.log_likelihoods + self.log_priors
        return self._train_categories[np.argmax(log_posteriors, axis=1)]

    def visualize(self):
        """Placeholder for plotting; not implemented yet."""
        pass


    
    

    
    
    

# ----------------------------------------the likelihood model-building class-------------------------------------

class LikelihoodModel:
    """Fits one density model (KDE or Gaussian mixture) per class label.

    After ``fit()``, ``model_dict`` maps ``'Model 0'``, ``'Model 1'``, ... to
    the fitted models, in the sorted order of the class labels.
    """

    def __init__(self, X, y, model_type):
        self.X = X
        self.y = y
        self.categories = np.unique(y)
        self.model_dict = None
        self.model_type = model_type  # the estimator class, not an instance

    def fit(self, param_grid=None):
        """Grid-search and fit a model for every class.

        Parameters
        ----------
        param_grid : dict, optional
            Parameter grid reused for every class. When omitted, the grid is
            requested interactively through ``input()`` once per class.

        Returns
        -------
        LikelihoodModel
            ``self``, with ``model_dict`` populated.

        Raises
        ------
        ValueError
            If ``model_type`` is neither KernelDensity nor GaussianMixture.
        """
        is_kde = self.model_type is sklearn.neighbors.KernelDensity
        is_gmm = self.model_type is sklearn.mixture.GaussianMixture
        if not (is_kde or is_gmm):
            raise ValueError('model_type must be KernelDensity or GaussianMixture, '
                             'got {!r}'.format(self.model_type))

        X = np.asarray(self.X)
        y = np.asarray(self.y)
        category_models = []
        for category in self.categories:
            X_category = X[y == category]
            if is_kde:
                grid = param_grid if param_grid is not None else self._prompt_kde_grid()
                base_model = KernelDensity(kernel='gaussian')
            elif param_grid is not None:
                grid = param_grid
                base_model = GaussianMixture()
            else:
                grid, n_components = self._prompt_gmm_grid()
                base_model = GaussianMixture(n_components=n_components)
            category_models.append(find_best_params(X_category, base_model, grid))

        # Positional '{}' placeholder: a named '{i}' field with a positional
        # argument raises KeyError.
        self.model_dict = {'Model {}'.format(i): m for i, m in enumerate(category_models)}
        return self

    @staticmethod
    def _prompt_kde_grid():
        """Ask for a parameter name and a log-spaced range of values to search."""
        key = input('Enter parameter name: ')
        start = float(input('Enter start value: '))
        end = float(input('Enter end value: '))
        # np.logspace needs an integer sample count; accept input such as "5.0" too.
        num_points = int(float(input('# of points? ')))
        return {key: np.logspace(start, end, num_points)}

    @staticmethod
    def _prompt_gmm_grid():
        """Ask for a parameter name and an upper bound; the grid is 1..bound."""
        key = input('Enter parameter name: ')
        n_components = int(input('# of components? '))
        return {key: np.arange(1, n_components + 1)}, n_components

    def visualize(self):
        """Placeholder for plotting a model chosen from ``model_dict``."""
        pass
        
        

#----------------------------------------the feature extractor class----------------------------------------
        
class FeatureExtractor:
    """Splits data into train/test sets and reduces it to a few features.

    Tabular (2-D) data keeps the columns a random forest ranks as most
    important; image (3-D) data is flattened and embedded with UMAP.
    """

    def __init__(self, X, y):
        self.X = X
        self.y = y
        self.features_extracted = None
        self.feature_names = None
        self.feature_indices = None  # selected column indices (tabular data)
        self._mapper = None          # fitted UMAP reducer (image data)

    def train_test_split(self, train_percent=70):
        """Split X/y into train and test parts; returns ``self`` for chaining.

        ``train_percent`` is the share of samples (in percent) used for training.
        """
        (self.X_train, self.X_test,
         self.y_train, self.y_test) = sklearn.model_selection.train_test_split(
            self.X, self.y, train_size=train_percent / 100, random_state=2)
        return self

    def get_features(self, num_features=2):
        """Extract ``num_features`` features from the training data.

        The result is stored in ``features_extracted`` (one row per training
        sample).

        Raises
        ------
        ValueError
            If tabular training data or labels contain missing values.
        """
        if is_tabular(self.X_train):
            # pd.isnull also copes with non-numeric label arrays, where np.isnan fails.
            if pd.isnull(self.X_train).any() or pd.isnull(self.y_train).any():
                raise ValueError('Your data-matrix or target-class has missing values. '
                                 'Please fill them in first.')
            RF = RandomForestClassifier(random_state=2)
            RF.fit(self.X_train, self.y_train)
            # Rank columns from most to least important; the stable sort keeps
            # the lower column index first on ties.
            ranking = np.argsort(-np.asarray(RF.feature_importances_), kind='stable')
            self.feature_indices = ranking[:num_features]
            self.features_extracted = np.asarray(self.X_train)[:, self.feature_indices]
            print('{} best features extracted!'.format(num_features))
        elif is_image(self.X_train):
            # Flatten every image into one row before embedding with UMAP.
            X_train_reshaped = np.reshape(self.X_train, (len(self.X_train), -1))
            mapper = umap.UMAP(n_components=num_features, random_state=2).fit(X_train_reshaped)
            self._mapper = mapper
            self.features_extracted = mapper.transform(X_train_reshaped)
            print('{} best features extracted!'.format(num_features))

    def transform(self, X):
        """Map new samples into the feature space produced by ``get_features``.

        Raises
        ------
        RuntimeError
            If ``get_features()`` has not extracted anything yet.
        """
        if self._mapper is not None:
            return self._mapper.transform(np.reshape(X, (len(X), -1)))
        if self.feature_indices is not None:
            return np.asarray(X)[:, self.feature_indices]
        raise RuntimeError('Call get_features() before transform().')
            

            
            
#-----------------------------------------auxiliary functions-------------------------------------------

def find_best_params(X, model_inst, param_grid):
    """Grid-search ``param_grid`` for ``model_inst`` and return the best fitted model.

    Parameters
    ----------
    X : array-like
        Samples the density model is fitted on (no targets are used).
    model_inst : KernelDensity or GaussianMixture instance
        Base estimator whose remaining settings are kept during the search.
    param_grid : dict
        Mapping of parameter name to the candidate values to try.

    Returns
    -------
    The estimator with the best cross-validated score, refitted on all of X.

    Raises
    ------
    TypeError
        If ``model_inst`` is neither a KernelDensity nor a GaussianMixture.
    """
    supported = (sklearn.neighbors.KernelDensity, sklearn.mixture.GaussianMixture)
    if not isinstance(model_inst, supported):
        raise TypeError('Error handling user-inputted model instance name: expected a '
                        'KernelDensity or GaussianMixture instance, got {!r}'.format(model_inst))
    grid = GridSearchCV(estimator=model_inst, param_grid=param_grid, n_jobs=-1)
    grid.fit(X)
    # With the default refit=True, best_estimator_ is already refitted on all
    # of X with the winning parameters.
    return grid.best_estimator_


def is_tabular(X):
    """Return True when the data-matrix is 2-dimensional, i.e. tabular data."""
    # np.ndim also accepts nested lists, not just objects with an .ndim attribute.
    return np.ndim(X) == 2

def is_image(X):
    """Return True when the data-matrix is 3-dimensional, i.e. a stack of images."""
    # np.ndim also accepts nested lists, not just objects with an .ndim attribute.
    return np.ndim(X) == 3
            

In [77]:
# Sanity check: a GaussianMixture instance passes the isinstance test that
# find_best_params uses to recognise the model type.
isinstance(GaussianMixture(n_components=5), sklearn.mixture.GaussianMixture)

True

In [None]:
Kern