In [19]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn import mixture
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression, SGDClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split

module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from aitia_explorer.app import App

# stop the warning clutter
import warnings
warnings.filterwarnings('ignore')

In [2]:
def get_gmm_sample_data(incoming_df, column_list, sample_size):
    """
    Unsupervised Learning in the form of BayesianGaussianMixture to create sample data.
    """
    gmm = mixture.BayesianGaussianMixture(n_components=2,
                                          covariance_type="full",
                                          n_init=100,
                                          random_state=42).fit(incoming_df)
    clustered_data = gmm.sample(sample_size)
    clustered_df = pd.DataFrame(clustered_data[0], columns=column_list)
    return clustered_df

def get_synthetic_training_data(incoming_df):
    """
    Creates synthetic training data by sampling from a BayesianGaussianMixture supplied distribution.
    Synthetic data is then labelled differently from the original data.
    """
    # number of records in df
    number_records = len(incoming_df.index)

    # get sample data from the unsupervised BayesianGaussianMixture
    df_bgmm = get_gmm_sample_data(incoming_df, list(incoming_df), number_records)

    # set the class on the samples
    df_bgmm['original_data'] = 0

    # add the class to a copy of incoming df, stops weird errors due to changed dataframes
    working_df = incoming_df.copy(deep=True)
    working_df['original_data'] = 1

    # concatinate the two dataframes
    df_combined = working_df.append(df_bgmm, ignore_index=True)

    # shuffle the data
    df_combined = df_combined.sample(frac=1)

    # get the X and y
    x = df_combined.drop(['original_data'], axis=1).values
    y = df_combined['original_data'].values
    y = y.ravel()

    return x, y

In [3]:
aitia = App()

In [5]:
df = aitia.data.hepar2_10k_data()

In [6]:
# get ths synthetic data
X, y = get_synthetic_training_data(df)

In [16]:
estimator = GradientBoostingClassifier()
selector = RFE(estimator, n_features_to_select=5, step=3)
selector = selector.fit(X, y)

In [17]:
# sort the feature indexes
feature_indices = []

for i in range(df.shape[1]):
    # see if column has been marked true or false
    if selector.support_[i]:
        feature_indices.append(i)

requested_features = [list(df)[i] for i in feature_indices]

In [18]:
df_reduced = df[requested_features]
df_reduced

Unnamed: 0,RHepatitis,proteins,hbsag_anti,consciousness,hbeag
0,-0.167845,-0.135771,-0.142857,-0.164669,-0.060108
1,-0.167845,7.365366,-0.142857,-0.164669,-0.060108
2,-0.167845,-0.135771,-0.142857,-0.164669,-0.060108
3,-0.167845,-0.135771,-0.142857,-0.164669,-0.060108
4,-0.167845,-0.135771,-0.142857,-0.164669,-0.060108
...,...,...,...,...,...
9995,-0.167845,-0.135771,-0.142857,-0.164669,-0.060108
9996,-0.167845,-0.135771,-0.142857,-0.164669,-0.060108
9997,-0.167845,-0.135771,-0.142857,-0.164669,-0.060108
9998,-0.167845,-0.135771,-0.142857,-0.164669,-0.060108


In [20]:
selector = SelectFromModel(estimator=GradientBoostingClassifier()).fit(X, y)

In [21]:
selector.estimator_.coef_

AttributeError: 'GradientBoostingClassifier' object has no attribute 'coef_'