In [None]:
# Import the MASCDB API
import mascdb.api
from mascdb.api import MASC_DB

# Import other libraries
from IPython.display import display
import pyarrow

# Import files
from helpers import *
from models import *
from cross_validation import *

# Autoreload
%load_ext autoreload
%autoreload 2
  

# Drop duplicates

***

In [None]:
# Name of the classification task handled by this notebook
classifier = 'hydro'

# Load the snowflake data set and the corresponding classes.
# The index of the class table is moved into a regular column by reset_index().
# NOTE: unpickling executes arbitrary code; only load pickle files you trust.
mascdb_data = pd.read_pickle('Data/data_set.pkl')
mascdb_classes = pd.read_pickle('Data/classes.pkl')
mascdb_classes = mascdb_classes.reset_index()

In [None]:
# Find the "wrong" duplicates: rows whose flake_id appears more than once
# but which are NOT exact copies of another row (they differ in at least one column).
mascdb_classes_copy = mascdb_classes.copy()

# Rows identical to some other row in every column
is_exact_duplicate = mascdb_classes_copy.duplicated(keep = False)
mascdb_classes_copy_1 = mascdb_classes_copy[is_exact_duplicate]

# Rows sharing their flake_id with at least one other row
has_repeated_flake_id = mascdb_classes_copy.duplicated(subset = ['flake_id'], keep = False)
mascdb_classes_copy_2 = mascdb_classes_copy[has_repeated_flake_id]

# Every exact duplicate also has a repeated flake_id, so the rows that belong
# to only one of the two sets are the repeated ids that are not exact copies.
mascdb_classes_wrong_duplicates = mascdb_classes_copy[has_repeated_flake_id & ~is_exact_duplicate]

In [None]:
# Inspect the rows flagged as conflicting duplicates
display(mascdb_classes_wrong_duplicates)

In [None]:
# One representative row (the first) per conflicting flake_id
mascdb_classes_wrong_duplicates_unique = mascdb_classes_wrong_duplicates.drop_duplicates(subset = 'flake_id')

# One row (the first) per flake_id among all the classified flakes
flake_id_classes = mascdb_classes_copy.drop_duplicates(subset = 'flake_id')

# Keep only the flakes whose id is not among the conflicting ones
is_conflicting = flake_id_classes['flake_id'].isin(mascdb_classes_wrong_duplicates_unique['flake_id'])
mascdb_classes_modified = flake_id_classes[~is_conflicting]

In [None]:
# Keep only the snowflakes whose flake_id is present in the cleaned class table,
# so that every remaining snowflake has a class
has_class = mascdb_data['flake_id'].isin(mascdb_classes_modified['flake_id'])
mascdb_data_modified = mascdb_data[has_class]

#### Transform the data (standardization)

***

In [None]:
# Apply a Yeo-Johnson power transform (with standardization) to every feature column.
# mascdb_data_modified_copy keeps the untransformed values.
mascdb_data_modified_copy = mascdb_data_modified.copy()

# Feature columns in their ORIGINAL order. `columns.difference(...)` returns a
# sorted Index, so using it as the assignment target would pair the transformer
# output (which follows the original column order) with the wrong column names.
feature_columns = mascdb_data_modified_copy.columns.drop('flake_id')

power_transformer = preprocessing.PowerTransformer(method = 'yeo-johnson', standardize = True)

# NOTE(review): the transformer is fitted on all rows, before the train/test
# split further down - confirm this is acceptable for the evaluation.
mascdb_data_modified_std = power_transformer.fit_transform(mascdb_data_modified_copy[feature_columns])

# Set the transformed data on an independent copy (avoids assigning into a frame
# obtained by boolean selection, which raises SettingWithCopyWarning).
# Re-running this cell on its own transforms already-transformed data;
# re-run from the filtering cell instead.
mascdb_data_modified = mascdb_data_modified_copy.copy()
mascdb_data_modified[feature_columns] = mascdb_data_modified_std

# Modeling

***

In [None]:
# Class table keyed by flake_id (set_index already returns a new frame)
y_ = mascdb_classes_modified.set_index('flake_id')

# Build the joined table X_:
#   1. select every column except flake_id,
#   2. re-create flake_id from the row index,
#   3. keep the first row per flake_id,
#   4. join the classes on the index.
# `assign` is used instead of `X_['flake_id'] = ...` because adding a column to
# the result of a column selection raises SettingWithCopyWarning.
# NOTE(review): steps 2 and 4 assume mascdb_data_modified is indexed by flake_id - confirm.
X_ = (
    mascdb_data_modified[mascdb_data_modified.columns.difference(['flake_id'])]
    .assign(flake_id = lambda frame: frame.index)
    .drop_duplicates(subset = 'flake_id', keep = 'first')
    .join(y_)
)

In [None]:
# Response set y: the class label, kept as a single-column DataFrame
y = X_['class_id'].to_frame()

# Data set X: every column of X_ except the identifier and the label.
# NOTE(review): any other column brought in by the join with the class table
# stays in X - confirm that only features remain.
predictor_columns = X_.columns.difference(['flake_id', 'class_id'])
X = X_[predictor_columns]

# Split data and Transform
***

### Using our split data 

In [None]:
# Get a train and test set for modeling with the project's split_data helper.
# Note the return order (X_train, y_train, X_test, y_test), which differs from
# the order used with train_test_split in the next cell.
# NOTE(review): n_s is presumably the number of splits - confirm in the helper.
X_train, y_train, X_test, y_test = split_data(X, y, n_s = 5)

In [None]:
# Alternative split: hold out 20% of the rows as test set, with a fixed seed
# (random_state=0) so that the split is reproducible.
# The *_bis variables are not used by the other cells shown in this notebook.
X_train_bis, X_test_bis, y_train_bis, y_test_bis = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# Transform y_train and y_test after the split, because StratifiedKFold can only
# deal with 1D arrays. The transformed labels can be used for the SVM, which
# also works well without them.
y_train_transformed, y_test_transformed = map(classes_transformed, (y_train, y_test))

# SMOTE for oversampling imbalanced classification datasets
***

In [None]:
# Oversample the training set only (the test set is not passed to the helper)
X_train_rs, y_train_rs = smote_data_augmentation(X_train, y_train)

# Feature augmentation and selection 
***

Feature importance 
https://machinelearningmastery.com/calculate-feature-importance-with-python/

In [None]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression

# Sequential feature selection needs a concrete estimator to score candidate
# feature subsets; the original cell left `estimator` unassigned (SyntaxError).
# TODO(review): LogisticRegression is a placeholder choice - replace it with the
# model actually intended for the selection.
estimator = LogisticRegression(max_iter = 1000)

# Fit the selector on the training set only, then apply the same column
# selection to both the training and the test set.
sfs = SequentialFeatureSelector(estimator).fit(X_train, y_train)
X_train_selec = sfs.transform(X_train)
X_test_selec = sfs.transform(X_test)

In [None]:
# Feature augmentation: polynomial expansion of degree 2.
# fit_transform returns a plain array, so the row index of X is passed back
# explicitly; otherwise X_poly gets a fresh 0..n-1 index and its rows no longer
# share labels with y.
poly = PolynomialFeatures(2)
X_poly = pd.DataFrame(poly.fit_transform(X), index = X.index)

In [None]:
# Split the data after feature augmentation.
# NOTE(review): this overwrites y_train and y_test from the earlier split of X;
# cells that pair them with X_train / X_test afterwards rely on both splits
# selecting the same rows - confirm, or use distinct names.
X_poly_train, y_train, X_poly_test, y_test = split_data(X_poly, y, n_s = 5)

In [None]:
# Oversample the augmented training set.
# NOTE(review): this overwrites y_train_rs produced for X_train_rs earlier;
# confirm both resampled label sets are identical, or use distinct names.
X_poly_train_rs, y_train_rs = smote_data_augmentation(X_poly_train, y_train)

In [None]:
# Feature selection without feature augmentation ('recursiveCV' mode of the helper).
# NOTE(review): the meaning of the last argument (5) is not visible here - confirm in the helper.
# X_train_selec / X_test_selec overwrite the SequentialFeatureSelector results above.
model_selec = get_model_features_selection(X_train_rs, y_train_rs, 'recursiveCV', 5)
X_train_selec = model_selec.transform(X_train_rs)
X_test_selec = model_selec.transform(X_test)

In [None]:
# Feature selection WITH feature augmentation ('recursive' mode of the helper),
# applied to the polynomial features. Reuses the name model_selec, so the
# selector fitted on the non-augmented features is no longer reachable.
model_selec = get_model_features_selection(X_poly_train_rs, y_train_rs, 'recursive')
X_poly_train_selec = model_selec.transform(X_poly_train_rs)
X_poly_test_selec = model_selec.transform(X_poly_test)

# Logistic Regression 
***

### Tune regularization for multinomial logistic regression

In [None]:
# Get the logistic regression model and its parameters from the helper, then
# evaluate it: trained on the oversampled training set, tested on the untouched test set.
# NOTE(review): the meaning of the boolean flag is not visible here - confirm in the helper.
LR, param = get_model_LR(True)
cv = evaluate_model(LR, param, X_train_rs, y_train_rs, X_test, y_test)

In [None]:
# Persist the object returned by evaluate_model for the logistic regression.
# The original passed `clf`, a name that is never defined in this notebook.
# NOTE(review): assumes evaluate_model returns the fitted search object (the
# cells below read `best_params_` from the reloaded model) - confirm.
save_model('LR.pkl', cv)

In [None]:
# Reload the saved logistic regression model from disk
model = load_model('LR.pkl')

In [None]:
# Show the best parameters stored on the reloaded model
display(model.best_params_)

# SVM

***

In [None]:
# Get the SVM model and its parameters from the helper, then evaluate it.
# Unlike the logistic regression, it is trained on X_train / y_train (not the
# oversampled set) and the return value of evaluate_model is discarded.
# NOTE(review): y_train may have been overwritten by the polynomial split above - confirm it still matches X_train.
svm, param = get_model_SVM(True)
evaluate_model(svm, param, X_train, y_train, X_test, y_test)