In [None]:
# Random forest classifier (RFC) that predicts variability-type labels from the input set of basic stellar properties.

In [1]:
import warnings 
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import seaborn as sns
from pushkin.train import train_rf
from pushkin.classify import classify
from os import path
from tqdm import tqdm

from imblearn.over_sampling import SMOTEN, BorderlineSMOTE, SVMSMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score, accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score

In [2]:
# Read the two CSV inputs in order: `df` receives the training table and
# `vt` the label table.
df, vt = (
    pd.read_csv(csv_path)
    for csv_path in ("../output/rf_train.csv", "../output/rf_labels.csv")
)

In [3]:
# Columns selected from the training frame. The first entry ('KIC') is
# sliced off again before modelling; the remaining entries are the features.
cols = [
    'KIC',
    'Mass', 'Teff', 'logg', '[Fe/H]',
    'Rad', 'rho', 'Lum',
    'Age', 'f_Age', 'Avmag', 'GOF', 'TAMS',
]

In [4]:
# Assemble the labelled inputs / targets and a standardised copy of the inputs

x_labelled = df[cols]  # no flux data, only prop
y_labelled = vt

# Drop the first column of each table; every later column is kept.
x_og = x_labelled.iloc[:, 1:]
y_og = y_labelled.iloc[:, 1:]

# NOTE(review): the scaler is fitted on every labelled row, i.e. before any
# train/test split takes place.
sc = StandardScaler().fit(x_og)
x_og_t = sc.transform(x_og)

In [5]:
# Show how many rows carry each distinct label value (class balance check).
y_og.value_counts()

vt
nv    84265
lt    10072
st     6103
mt     4542
eb     1515
dtype: int64

In [None]:
# Per-class sample counts requested from the resamplers: the four classes
# listed here are oversampled to 20000 rows each, and 'nv' is undersampled
# to the same size.
over_strat = {label: 20000 for label in ('lt', 'st', 'mt', 'eb')}
under_strat = dict(nv=20000)

In [43]:
# Define a pipeline that first oversamples the classes in `over_strat` and
# then undersamples the classes in `under_strat`.
#
# The oversampler was left blank (`over = `), which is a syntax error. It is
# filled in with SMOTEN configured exactly like the standalone `smote`
# sampler defined elsewhere in this notebook.
# NOTE(review): SMOTEN is documented for nominal/categorical features; if the
# stellar-property columns are continuous, a continuous-feature oversampler
# may be more appropriate -- confirm the intended sampler.
over = SMOTEN(sampling_strategy=over_strat, random_state=0)
under = RandomUnderSampler(sampling_strategy=under_strat, random_state=0)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

# Resample the original (unscaled) features and labels through both steps.
x_smote, y_smote = pipeline.fit_resample(x_og, y_og)

In [None]:
# Standalone oversamplers, each applied to the original features and labels.

# SMOTEN with explicit per-class target counts. The resampling must go
# through `smote` itself: the previous code built `smote` but then called
# `pipeline.fit_resample`, so the sampler defined here was never used.
smote = SMOTEN(sampling_strategy=over_strat, random_state=0)
x_smote, y_smote = smote.fit_resample(x_og, y_og)

# similar to smote, slightly better
border = BorderlineSMOTE(random_state=0)
x_border, y_border = border.fit_resample(x_og, y_og)

# bad
svm = SVMSMOTE(random_state=0)
x_svm, y_svm = svm.fit_resample(x_og, y_og)

# also bad
adasyn = ADASYN(random_state=0)
x_adasyn, y_adasyn = adasyn.fit_resample(x_og, y_og)

In [78]:
# Samplers from imblearn.combine that over- and undersample in one step.
# Both are seeded identically and applied to the original features / labels.

sme, tomek = SMOTEENN(random_state=0), SMOTETomek(random_state=0)

x_sme, y_sme = sme.fit_resample(x_og, y_og)
x_tomek, y_tomek = tomek.fit_resample(x_og, y_og)

In [7]:
# Use the SMOTETomek-resampled data as the modelling set.
x, y = x_tomek, y_tomek
x_t = sc.transform(x)

# 90 % train; the remaining 10 % is split evenly into test and validation.
# Both splits are seeded (random_state=0, matching the samplers) so that a
# fresh kernel reproduces the same partitions.
# NOTE(review): the split happens after resampling, so resampled rows can end
# up in the test / validation sets -- confirm this is intended.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=0)
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, test_size=0.5, random_state=0)

# Apply the already-fitted scaler to each partition.
x_train_t = sc.transform(x_train)
x_test_t = sc.transform(x_test)
x_val_t = sc.transform(x_val)

In [8]:
# Random Forest Classifier

# random_state is fixed so that the fitted forest (and everything derived
# from it: importances, predictions, CV scores) is reproducible.
rf = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=None,
                            min_samples_split=100, max_features='sqrt', random_state=0)

# The labels are flattened to 1-D: scikit-learn expects a 1-D target and only
# converts a column-vector target with a warning.
rf.fit(x_train_t, np.ravel(y_train))

# Feature importances, labelled with the feature column names, largest first.
importances = pd.Series(rf.feature_importances_, index=x.columns).sort_values(ascending=False)
print(importances)

KeyboardInterrupt: 

In [None]:
# Evaluating the model

# Fit the encoder once on the training labels; only the fitted encoder is
# needed, not the transformed training labels.
label_encoder = LabelEncoder()
label_encoder.fit(np.ravel(y_train))

# Encoded targets that match each feature matrix row-for-row.
y_val_enc = label_encoder.transform(np.ravel(y_val))
y_og_enc = label_encoder.transform(np.ravel(y_og))

# The target has more than two classes, so the plain 'roc_auc' scorer raises;
# 'roc_auc_ovr' computes the one-vs-rest multiclass AUC instead.
rfc_cv_score = cross_val_score(rf, x_val_t, y_val_enc, cv=10,
                               scoring='roc_auc_ovr', error_score='raise')

# Scores on the original data must use the original labels: the previous
# code paired x_og_t with the validation labels, whose length does not match.
og_score = cross_val_score(rf, x_og_t, y_og_enc, cv=10,
                           scoring='roc_auc_ovr', error_score='raise')

In [None]:
# Report for the original data set: predictions of the fitted forest on the
# scaled original features, compared against the original labels.
y_inp = y_og

y_pred = rf.predict(x_og_t)

# Each section is a header plus a callable producing the arguments to print;
# the callables are evaluated lazily, right after their header is printed.
sections = (
    ("=== Confusion Matrix ===",
     lambda: (confusion_matrix(y_inp, y_pred),)),
    ("=== Classification Report ===",
     lambda: (classification_report(y_inp, y_pred),)),
    ("=== All AUC Scores ===",
     lambda: (og_score,)),
    ("=== Mean AUC Score ===",
     lambda: ("Mean AUC Score: ", og_score.mean())),
    ("=== Balanced Accuracy Score ===",
     lambda: ("Balanced Accuracy Score: ", balanced_accuracy_score(y_inp, y_pred))),
)

for position, (header, make_args) in enumerate(sections):
    # A blank separator goes between sections, not before the first one.
    if position > 0:
        print('\n')
    print(header)
    print(*make_args())

In [None]:
def _print_train_report(y_true, y_hat, auc_scores):
    """Print confusion matrix, classification report, AUC scores and balanced accuracy.

    y_true / y_hat are the reference and predicted labels; auc_scores is the
    array of AUC values whose elements and mean are printed.
    """
    print("=== Confusion Matrix ===")
    print(confusion_matrix(y_true, y_hat))
    print('\n')
    print("=== Classification Report ===")
    print(classification_report(y_true, y_hat))
    print('\n')
    print("=== All AUC Scores ===")
    print(auc_scores)
    print('\n')
    print("=== Mean AUC Score ===")
    print("Mean AUC Score: ", auc_scores.mean())
    print('\n')
    print("=== Balanced Accuracy Score ===")
    print("Balanced Accuracy Score: ", balanced_accuracy_score(y_true, y_hat))


# Report for the training split.
# NOTE(review): the AUC values shown come from `rfc_cv_score`, which is
# computed in another cell -- confirm it refers to the same data as the
# training-set metrics printed alongside it.
y_inp = y_train

y_pred = rf.predict(x_train_t)

_print_train_report(y_inp, y_pred, rfc_cv_score)

In [None]:
# Visualize the row-normalised confusion matrix for the original data set

y_inp = y_og
y_pred = rf.predict(x_og_t)

# Class order is taken from the fitted model and passed explicitly so that
# matrix rows/columns and the tick labels are guaranteed to agree.
class_names = rf.classes_.tolist()

# Get the confusion matrix and normalise each row (true class) to sum to 1
matrix = confusion_matrix(y_inp, y_pred, labels=class_names)
matrix = matrix.astype('float') / matrix.sum(axis=1)[:, np.newaxis]

# Build the plot
plt.figure(figsize=(16,7))
sns.set(font_scale=1.4)
sns.heatmap(matrix, annot=True, annot_kws={'size':10},
            cmap=plt.cm.Greens, linewidths=0.2)

# Add labels to the plot. Heatmap cells are centred at i + 0.5, so both axes
# use the shifted positions; previously the x ticks sat on the cell edges
# while the y ticks sat on the cell centres.
tick_marks = np.arange(len(class_names))
tick_marks2 = tick_marks + 0.5
plt.xticks(tick_marks2, class_names, rotation=25)
plt.yticks(tick_marks2, class_names, rotation=0)
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion Matrix for Random Forest Model')
plt.show()

In [None]:
# Import only the function that is used instead of `import *`, so the origin
# of the name is explicit and the notebook namespace is not polluted.
from dtreeviz.trees import dtreeviz

# Index of the tree inside the fitted forest to visualise.
i = 0

# Integer-encode the training labels once; the fitted encoder also supplies
# the class names for the legend.
label_encoder = LabelEncoder()
y_train_enc = label_encoder.fit_transform(np.ravel(y_train))

# The forest was fitted on the scaled features (x_train_t), so the tree's
# split thresholds live in scaled units. The visualisation therefore has to
# receive the scaled matrix as well; passing the unscaled values would route
# samples through the wrong branches.
viz = dtreeviz(rf.estimators_[i],
               x_data=x_train_t,
               y_data=y_train_enc,
               target_name='vt',
               feature_names=x.columns.tolist(),
               class_names=label_encoder.classes_.tolist(),
               title="Decision Tree - Balanced Training Set")

In [None]:
# Display the object returned by the dtreeviz call as this cell's output.
viz

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# number of trees in random forest: 10 evenly spaced values from 200 to 2000
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# number of features at every split
# ('auto' is omitted: for classifiers it was an alias of 'sqrt', and it has
# been deprecated and later removed in scikit-learn, where sampling it would
# make the search fail.)
max_features = ['sqrt', 'log2']

# max depth: 11 evenly spaced values from 100 to 500, plus None (unlimited)
max_depth = [int(x) for x in np.linspace(100, 500, num = 11)]
max_depth.append(None)
# create random grid
random_grid = {
 'n_estimators': n_estimators,
 'max_features': max_features,
 'max_depth': max_depth
 }
# Random search of parameters: 10 sampled combinations, 5-fold CV each,
# seeded for reproducibility and run on all available cores.
rfc_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 10, cv = 5, verbose=2, random_state=42, n_jobs = -1)

In [None]:
# Run the randomized search on the scaled training split and print the best
# parameter combination found (fit() returns the fitted search object).
print(rfc_random.fit(x_train_t, y_train).best_params_)

In [None]:
# Plot train / test / validation accuracy as a function of max_depth

param=[2, 4, 8, 16, 32, 64, None]
train = []
test = []
val = []

# 1-D label arrays for fitting and scoring.
y_train_1d = np.ravel(y_train)
y_test_1d = np.ravel(y_test)
y_val_1d = np.ravel(y_val)

# The loop uses its own names (`depth`, `rf_depth`) so that the notebook-level
# feature frame `x` and the fitted model `rf` are not overwritten.
for depth in param:
    rf_depth = RandomForestClassifier(n_estimators=100, max_depth=depth, max_features='sqrt',
                                      min_samples_split=3, random_state=0)
    rf_depth.fit(x_train_t, y_train_1d)
    # Accuracy is used because the targets are class labels; an R^2 score is
    # a regression metric and is not defined for them.
    train.append(accuracy_score(y_train_1d, rf_depth.predict(x_train_t)))
    test.append(accuracy_score(y_test_1d, rf_depth.predict(x_test_t)))
    val.append(accuracy_score(y_val_1d, rf_depth.predict(x_val_t)))

# `param` contains None, which cannot be placed on a numeric axis, so the
# points are drawn at integer positions and labelled with the depth values.
positions = np.arange(len(param))

train_plot = plt.scatter(positions, train)
test_plot = plt.scatter(positions, test)
val_plot = plt.scatter(positions, val)

plt.legend((train_plot, test_plot, val_plot),
           ('Train', 'Test', 'Validation'),
           scatterpoints=1,
           loc='best',
           fontsize=8)

plt.xticks(positions, [str(depth) for depth in param])
plt.ylim(0, 1)
plt.xlabel('max_depth')
plt.ylabel('Accuracy')

plt.show()

In [None]:
# Train and optionally save a classifier with pushkin's train_rf, once per run.

n_train = 1  # number of training runs; also drives the run index `ii` below
abspath = path.abspath ('..') + '/Pushkin/'
classify_missing = False

# `abspath` already ends with '/', so sub-paths are appended without a
# leading slash (PATH_SAMPLES previously produced a doubled '//').
PATH_OUT_CLASSIFIER = abspath + 'data/rf_storage/'
PATH_SAMPLES = abspath + 'data/input_samples/'
file_train = 'kepler_km_sample.csv'

file_flag = 'kepler_km_full_flag.csv'
PATH_OUT = abspath + 'data/results_training/filter/'

# SWITCH TO TRUE TO SAVE THE SUMMARY FRAME AND THE CLASSIFIER
summary = True
save = True
plot = True # will show only the first plot not to overwhelm the notebook

class_weight = 'balanced'
test_size = 0.25

# `ii` was used without being defined; looping over range(n_train) defines it
# and gives each run its own output names and random seed.
for ii in range(n_train):
    fileout_result = 'run_' + str(ii+1) + '_km_filter_frame.csv'
    # NOTE(review): `fileout_classifier` was referenced but never defined in
    # this notebook. The name below mirrors `fileout_result` and is only a
    # placeholder -- confirm the file name / extension train_rf expects.
    fileout_classifier = 'run_' + str(ii+1) + '_km_filter_classifier'

    clf = train_rf (df, clf_name=PATH_OUT_CLASSIFIER+fileout_classifier, frame_name=PATH_OUT+fileout_result,
            n_estimators=300, criterion='gini', min_samples_split=2, plot=plot,
            class_weight=class_weight, test_size=test_size, summary=summary, save=save, random_state=ii, verbose=1)
