# Anomaly Detection with Gaussian Mixture Model - Part 3

In the previous part we tried to come up with a learning curve for our model in order to be able to choose the best parameters. 

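In this part we'll stand on the shoulders of giants and use the Bayesian Information Criterion (BIC) to guide us in selecting the best model. BIC rewards a good fit to the data but penalizes the number of model parameters, so the model with the lowest BIC is preferred.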

In [23]:
import numpy as np
import sklearn as sk
from sklearn import mixture
import pandas as pd
%pylab inline

# Plot/display configuration for the rest of the notebook.
# NOTE(review): 'display.mpl_style' is a legacy pandas option name - confirm
# it still exists in the installed pandas version before re-running.
pd.set_option('display.mpl_style', 'default')
# Default figure size for every plot: 18 wide by 4 tall.
# `plt` is not imported explicitly; presumably it is provided by the
# `%pylab inline` magic above.
plt.rcParams["figure.figsize"] = (18,4)


Populating the interactive namespace from numpy and matplotlib


In [24]:
def get_train_data(data_size):
    """Generate a synthetic 2-D training set made of three clusters.

    With ``m = data_size // 5`` the clusters are:

    * ``m`` points of unit Gaussian noise shifted by -20 on both axes;
    * ``2 * m`` points whose ``x`` is unit Gaussian noise shifted by -10 and
      whose ``y`` is ``-3 * x - 10`` plus Gaussian noise scaled by 2;
    * ``m`` points whose ``x`` is unit Gaussian noise and whose ``y`` is
      ``2 * x - 10`` plus unit Gaussian noise.

    The result therefore holds ``4 * m`` rows (800 for ``data_size=1000``),
    not ``data_size`` rows.

    Parameters
    ----------
    data_size : int
        Nominal data set size; one fifth of it is the base cluster size.

    Returns
    -------
    pandas.DataFrame
        Columns ``x`` and ``y``. Each cluster keeps its own 0-based index,
        so index labels repeat across clusters.
    """
    # Floor division keeps m an integer on Python 3 as well; randn rejects
    # float dimensions.
    m = data_size // 5
    columns = ['x', 'y']

    # np.random is referenced explicitly instead of the bare `random` name,
    # which only exists after `%pylab inline`. The order of the random draws
    # is unchanged, so a fixed seed yields the same data as before.
    df1 = pd.DataFrame(np.random.randn(m, 2) - 20, columns=columns)

    df2 = pd.DataFrame(np.random.randn(m * 2, 2) - 10, columns=columns)
    df2['y'] = -df2['x'] * 3 - 10 + np.random.randn(m * 2) * 2

    df3 = pd.DataFrame(np.random.randn(m, 2), columns=columns)
    df3['y'] = df3['x'] * 2 - 10 + np.random.randn(m)

    # pd.concat replaces the chained DataFrame.append calls, which newer
    # pandas releases no longer provide; row order and index are the same.
    return pd.concat([df1, df2, df3])
    
def train_anomaly_model(df, n_components=3, cov_type='full'):
    """Fit a Gaussian mixture model to the ``x`` and ``y`` columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data; only the ``x`` and ``y`` columns are used.
    n_components : int, optional
        Number of mixture components (default 3).
    cov_type : str, optional
        Forwarded to the model as ``covariance_type`` (default ``'full'``).

    Returns
    -------
    The ``mixture.GMM`` instance after ``fit`` has been called on it.
    """
    gmm = mixture.GMM(n_components, covariance_type=cov_type)
    features = df[['x', 'y']]
    gmm.fit(features)
    return gmm

def find_anomaly_limit(df, model, n_falsePositives):
    """Return the likelihood threshold below which a point counts as anomalous.

    The threshold is the percentile of the likelihoods of the rows in ``df``
    that corresponds to ``n_falsePositives`` out of ``len(scores)`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Data with ``x`` and ``y`` columns; only those columns are scored.
    model : object
        Fitted model with a ``score`` method. Assumes ``score`` returns one
        log-likelihood per row (as the legacy sklearn ``GMM`` does) -
        TODO confirm if a different model class is passed in.
    n_falsePositives : int or float
        Number of rows of ``df`` tolerated below the limit.

    Returns
    -------
    float
        The likelihood (not log-likelihood) at that percentile.
    """
    scores = model.score(df[['x', 'y']])

    # 100.0 forces float arithmetic. With two ints, Python 2's `/` truncated
    # the ratio to 0, so the limit was always the minimum likelihood.
    percentile = 100.0 * n_falsePositives / len(scores)

    return np.percentile(np.exp(scores), percentile)

def choose_best_model(df, max_components=5,
                      cov_types=('spherical', 'tied', 'diag', 'full')):
    """Grid-search mixture models and keep the one with the lowest BIC.

    One model is trained with ``train_anomaly_model`` for every combination
    of component count (1 to ``max_components`` inclusive) and covariance
    type, and each is scored with its ``bic`` method.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data with ``x`` and ``y`` columns.
    max_components : int, optional
        Largest number of mixture components to try (default 5).
    cov_types : iterable of str, optional
        Covariance types to try, in order.

    Returns
    -------
    tuple
        ``(min_bic, min_model)``. On a tie the earliest combination wins,
        because the comparison is strict. If no model scores below the
        initial sentinel, ``(1e100, 'unknown')`` is returned.
    """
    # Score on exactly the columns the models are trained on, so extra
    # columns in df cannot break or skew the BIC computation.
    features = df[['x', 'y']]

    (min_bic, min_model) = (1e100, 'unknown')
    for n_components in range(1, max_components + 1):
        for cov_type in cov_types:
            model = train_anomaly_model(df, n_components, cov_type)
            bic = model.bic(features)
            if bic < min_bic:
                (min_bic, min_model) = (bic, model)
    return min_bic, min_model

# Generate the synthetic training set, pick the model with the lowest BIC
# and report it.
df = get_train_data(1000)
bic, model = choose_best_model(df)
# A single parenthesised argument prints the same text under Python 2's
# print statement and Python 3's print function.
print('model = {}\nbic = {}'.format(model, bic))


model = GMM(covariance_type='full', init_params='wmc', min_covar=0.001,
  n_components=3, n_init=1, n_iter=100, params='wmc', random_state=None,
  thresh=None, tol=0.001, verbose=0)
bic = 6918.36434336


Rather reassuringly, the same parameters (3 components, full covariance) were selected as in the previous part.

Great.