In [None]:
# Random forest classifier (RFC) that predicts variability labels from basic stellar properties.

In [None]:
import warnings
warnings.filterwarnings("ignore")

import math
from os import path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

from imblearn.over_sampling import SMOTE, SMOTEN, BorderlineSMOTE, SVMSMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, label_binarize
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score, accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score

from pushkin.train import train_rf
from pushkin.classify import classify

In [None]:
# Read the feature table and the label table produced by earlier steps.
df, vt = (
    pd.read_csv(csv_file)
    for csv_file in ("../output/rf_train.csv", "../output/rf_labels_2.csv")
)

In [None]:
# Collapse the 'st', 'mt' and 'lt' labels into a single 'var' label.
simp = vt.replace(['st', 'mt', 'lt'], 'var')

In [None]:
# Build views without the 'eb'-labelled rows under NEW names instead of
# overwriting `df`. The labels used for training come from the unfiltered
# `vt`, so filtering `df` in place left features and labels with different
# row counts (and the resampling strategy below still expects an 'eb' class).
# Using one precomputed mask also keeps this cell safe to re-run.
not_eb = simp['vt'] != 'eb'
df_no_eb = df[not_eb]
simp_no_eb = simp[not_eb]

In [None]:
# Columns pulled from the feature table. The first entry ('KIC') is sliced
# off again before modelling, so only the remaining columns act as features.
cols = [
    'KIC',
    'Mass', 'Teff', 'logg', '[Fe/H]',
    'Rad', 'rho', 'Lum',
    'Age', 'f_Age',
    'Avmag', 'GOF', 'TAMS',
]

In [None]:
# Separate features from labels and fit the scaler on the unresampled features

sc = StandardScaler()

# Feature frame holds only the selected property columns (no flux data);
# the label frame is the raw label table.
x_labelled, y_labelled = df[cols], vt

# Drop the first column of each frame ('KIC' for the features; presumably the
# same identifier for the labels -- TODO confirm against rf_labels_2.csv).
y_og = y_labelled.iloc[:, 1:]
x_og = x_labelled.iloc[:, 1:]

# Fit the scaler on all unresampled features and keep the scaled copy.
x_og_t = sc.fit(x_og).transform(x_og)

In [None]:
# Class balance of the labels before any resampling is applied.
y_og.value_counts()

In [None]:
# Target sample counts per class: classes in over_strat are passed to the
# over-sampler, classes in under_strat to the under-sampler.
over_strat = dict.fromkeys(('lt', 'mt', 'eb'), 10000)
under_strat = dict.fromkeys(('nv', 'st'), 10000)

In [None]:
# Two-stage resampling: BorderlineSMOTE over-samples the classes listed in
# over_strat, then RandomUnderSampler trims the classes listed in under_strat.
over = BorderlineSMOTE(random_state=0, sampling_strategy=over_strat)
under = RandomUnderSampler(random_state=0, sampling_strategy=under_strat)

steps = [
    ('o', over),
    ('u', under),
]
pipeline = Pipeline(steps)

# Resample the unscaled features together with their labels.
x, y = pipeline.fit_resample(x_og, y_og)

In [None]:
# Over-sampling variants tried on the unresampled data, each with its default
# sampling strategy and a fixed seed.

# Plain SMOTE, not SMOTEN: SMOTEN is the variant for nominal (categorical)
# features, whereas the feature columns here appear to be continuous stellar
# properties that SMOTE can interpolate between.
smote = SMOTE(random_state=0)
x_smote, y_smote = smote.fit_resample(x_og, y_og)

# similar to smote, slightly better
border = BorderlineSMOTE(random_state=0)
x_border, y_border = border.fit_resample(x_og, y_og)

# bad
svm = SVMSMOTE(random_state=0)
x_svm, y_svm = svm.fit_resample(x_og, y_og)

# also bad
adasyn = ADASYN(random_state=0)
x_adasyn, y_adasyn = adasyn.fit_resample(x_og, y_og)

In [None]:
# Combined over- and under-sampling variants (SMOTE followed by a cleaning step)

sme, tomek = SMOTEENN(random_state=0), SMOTETomek(random_state=0)

x_sme, y_sme = sme.fit_resample(x_og, y_og)
x_tomek, y_tomek = tomek.fit_resample(x_og, y_og)

In [None]:
# Hold out 10% of the resampled data, then split that hold-out evenly into
# test and validation sets (90 / 5 / 5 overall). The seeds are fixed so the
# partition is identical on every Restart & Run All.
# NOTE(review): x, y are the *resampled* data, so the held-out sets contain
# synthetic samples derived from training neighbours -- confirm this is intended.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=0)
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, test_size=0.5, random_state=0)

# Apply the already-fitted scaler to every partition.
x_train_t = sc.transform(x_train)
x_test_t = sc.transform(x_test)
x_val_t = sc.transform(x_val)

In [None]:
# Random Forest Classifier

# random_state pins the bootstrap sampling and per-split feature subsampling,
# so the fitted forest (and everything derived from it) is reproducible.
rf = RandomForestClassifier(n_estimators=1400, criterion='gini', max_depth=100,
                            min_samples_split=100, max_features='sqrt', random_state=0)
rf.fit(x_train_t, y_train)

# Feature names are read from x_train (same columns as the resampled frame)
# so this cell does not depend on the global `x`, which a later loop reuses.
importances = pd.Series(rf.feature_importances_, index=x_train.columns).sort_values(ascending=False)
print(importances)

In [None]:
# Evaluating the model

# Indicator (one column per class) form of the validation labels, with the
# columns ordered like the fitted forest's classes.
y_val_indicator = label_binarize(y_val, classes=rf.classes_.tolist())

# 10-fold cross-validated ROC AUC computed on the validation partition;
# any fold error is raised instead of being recorded as a score.
rfc_cv_score = cross_val_score(
    rf,
    x_val_t,
    y_val_indicator,
    cv=10,
    scoring='roc_auc',
    error_score='raise',
)

In [None]:
# Text report for the validation partition
y_inp = y_val
y_pred = rf.predict(x_val_t)

# Each entry is (header line, values printed on the following line).
report_sections = [
    ("=== Confusion Matrix ===", (confusion_matrix(y_inp, y_pred),)),
    ("=== Classification Report ===", (classification_report(y_inp, y_pred),)),
    ("=== All AUC Scores ===", (rfc_cv_score,)),
    ("=== Mean AUC Score ===", ("Mean AUC Score: ", rfc_cv_score.mean())),
    ("=== Balanced Accuracy Score ===",
     ("Balanced Accuracy Score: ", balanced_accuracy_score(y_inp, y_pred))),
]

for position, (header, payload) in enumerate(report_sections):
    if position > 0:
        # Blank separator between consecutive sections.
        print('\n')
    print(header)
    print(*payload)

In [None]:
# Visualize the row-normalised confusion matrix on the full unresampled set

y_inp = y_og
y_pred = rf.predict(x_og_t)

# Pin the matrix row/column order to the forest's class order so the tick
# labels below are guaranteed to describe the right cells.
class_names = rf.classes_.tolist()

# Get the confusion matrix and normalise each row (true class) to sum to 1
matrix = confusion_matrix(y_inp, y_pred, labels=class_names)
matrix = matrix.astype('float') / matrix.sum(axis=1)[:, np.newaxis]

# Build the plot
plt.figure(figsize=(16,7))
sns.set(font_scale=1.4)
sns.heatmap(matrix, annot=True, annot_kws={'size':10},
            cmap=plt.cm.Greens, linewidths=0.2)

# Add labels to the plot. Heatmap cell k spans [k, k+1], so both axes need
# their ticks at k + 0.5 to sit in the middle of the cells (the x ticks were
# previously placed on the cell edges).
tick_marks = np.arange(len(class_names)) + 0.5
plt.xticks(tick_marks, class_names, rotation=25)
plt.yticks(tick_marks, class_names, rotation=0)
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion Matrix for Random Forest Model')
plt.show()

In [None]:
# Fit an encoder that maps the class labels in y_train to integer codes.
# The encoded array returned here is only displayed as the cell output;
# later cells call fit_transform again when they need the codes.
label_encoder = LabelEncoder()
label_encoder.fit_transform(y_train)

In [None]:
# Visualize a single decision tree from the fitted forest

# Import only the name that is used rather than star-importing the module.
from dtreeviz.trees import dtreeviz

i = 0  # index of the tree inside rf.estimators_ to draw

# The forest was trained on the scaled features (x_train_t), so the tree's
# split thresholds live in scaled space; the data handed to dtreeviz must be
# in that same space or samples are shown on the wrong side of the splits.
# Feature names come from x_train so this cell does not rely on the global `x`.
viz = dtreeviz(rf.estimators_[i],
               x_data=x_train_t,
               y_data=label_encoder.fit_transform(y_train),
               target_name='vt',
               feature_names=x_train.columns.tolist(),
               class_names=label_encoder.classes_.tolist(),
               title="Decision Tree - Balanced Training Set")

In [None]:
# Show the tree rendering held in `viz` (the object returned by dtreeviz).
viz.view()

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Forest sizes to sample from: 10 evenly spaced integers in [200, 2000]
n_estimators = np.linspace(200, 2000, num=10).astype(int).tolist()

# Number of features considered at every split
max_features = ['sqrt', 'log2']

# Depth limits to sample from: 11 evenly spaced integers in [100, 500],
# plus None for unlimited depth
max_depth = np.linspace(100, 500, num=11).astype(int).tolist() + [None]

# Search space handed to the randomized search
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
)

# Randomized search: 5 sampled configurations, 5-fold CV each, all cores
rfc_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=5,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

In [None]:
# Run the randomized search on the scaled training features, using
# integer-encoded labels as the target
encoded_train_labels = label_encoder.fit_transform(y_train.values)
rfc_random.fit(x_train_t, encoded_train_labels)

# Print the best parameter set, then its mean cross-validated score
for search_result in (rfc_random.best_params_, rfc_random.best_score_):
    print(search_result)

In [None]:
# Plot train / test / validation accuracy as a function of max_depth

param = [2, 4, 8, 16, 32, 64, None]  # None = unlimited depth
train = []
test = []
val = []

# Loop-local names (`depth`, `depth_rf`) are used so this sweep does not
# overwrite the resampled feature frame `x` or the main fitted model `rf`.
for depth in param:
    depth_rf = RandomForestClassifier(n_estimators=100, max_depth=depth, max_features='sqrt',
                                      min_samples_split=3, random_state=0)
    depth_rf.fit(x_train_t, y_train)
    # Accuracy is used as the score: the targets are class labels, for which
    # a regression metric such as R^2 is not defined.
    train.append(accuracy_score(np.ravel(y_train), depth_rf.predict(x_train_t)))
    test.append(accuracy_score(np.ravel(y_test), depth_rf.predict(x_test_t)))
    val.append(accuracy_score(np.ravel(y_val), depth_rf.predict(x_val_t)))

# Plot against ordinal positions and label the ticks with the depth values,
# because None cannot be placed on a numeric axis.
positions = np.arange(len(param))
train_plot = plt.scatter(positions, train)
test_plot = plt.scatter(positions, test)
val_plot = plt.scatter(positions, val)

plt.legend((train_plot, test_plot, val_plot),
           ('Train', 'Test', 'Validation'),
           scatterpoints=1,
           loc='best',
           fontsize=8)

plt.xticks(positions, [str(depth) for depth in param])
plt.ylim(0, 1)
plt.xlabel('max_depth')
plt.ylabel('Score')

plt.show()