In [None]:
# Import the MASCDB API
import mascdb.api
from mascdb.api import MASC_DB

# Import other libraries
from IPython.display import display
import numpy as np
import pandas as pd
import pyarrow
import matplotlib.pyplot as plt

# Import sklearn tools
from sklearn.model_selection import *
from sklearn.metrics import accuracy_score
from sklearn import *
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Import files
from helpers import *

# Autoreload
%load_ext autoreload
%autoreload 2

In [None]:
# Path to the MASCDB data directory (relative to the notebook's working directory).
masc_path = 'Data/MASCDB'
# Build the database object from that directory; it is the source of the
# per-camera feature tables (e.g. `.cam0`) used throughout the notebook.
mascdb_features = MASC_DB(masc_path)

In [None]:
# Quick look at the loaded database: textual summary, size, and the
# distribution of "Dmax" for camera 0.
print(mascdb_features)
# Jupyter only echoes the *last* expression of a cell; a bare `len(...)` in the
# middle is evaluated and thrown away, so print it explicitly.
print(len(mascdb_features))
# Box plot of the "Dmax" column of camera 0 (via the `sns` accessor exposed on the table).
mascdb_features.cam0.sns.boxplot(x = "Dmax")

**Remark:** Par exemple, on peut avoir 1100 flocons qui sont classifiés pour la cam 2, mais seulement 815 sont dans le fichier parquet. La raison vient du fait que la classification a été faite sur les images et donc les données numériques n'ont pas nécessairement été produites pour toutes les photos.

In [None]:
# Build the list of columns to delete for our experiments:
#  1. every camera-0 column whose name contains one of the blacklisted substrings,
#  2. plus a fixed set of bookkeeping columns.
black_list_words = ['roi', 'riming', 'melting', 'snowflake', 'hl']
cols_to_delete = [
    col_name
    for col_name in mascdb_features.cam0.columns
    if any(word in col_name for word in black_list_words)
]
cols_to_delete += ['datetime', 'pix_size', 'flake_number_tmp', 'event_id']

In [None]:
# Create a filtered database without the columns collected in `cols_to_delete`.
# NOTE(review): presumably `drop_cam_columns` returns a new object and drops the
# columns from every camera table — confirm against the mascdb API.
mascdb_features_filt = mascdb_features.drop_cam_columns(cols_to_delete)
# Show the remaining columns of camera 0 and their dtypes.
display(mascdb_features_filt.cam0.dtypes)

**Remark:** Normal d'avoir deux fois la colonne `flake_id` dans notre dataframe. 

In [None]:
# Load the manual classifications and extract the camera-0 training subset
# for the "hydro" classification.
mascdb_classes = MASCDB_classes("Data")
cam0_train_set = mascdb_classes.get_sub_data_cam("hydro", 0, mascdb_features.cam0)

# Preview the first labelled rows of camera 0.
display(mascdb_classes.hydro_cam0.head(10))

# Show the shape (rows, columns) of the labelled subset for each class id 1..6.
for class_id in range(1, 7):
    is_class = mascdb_classes.hydro_cam0.class_id == class_id
    display(mascdb_classes.hydro_cam0[is_class].shape)

In [None]:
# Standardization of the data
# Fetch the feature rows that have a "hydro" label, then apply a Yeo-Johnson
# power transform to every column except the `flake_id` identifier.
# `standardize=True` additionally rescales the output to zero mean / unit variance.
classified_data = mascdb_classes.get_classified_data("hydro", mascdb_features_filt)
pt = preprocessing.PowerTransformer(method='yeo-johnson', standardize=True)
# Fit and transform in a single call. The original bound the result of
# `pt.fit(...)` (the transformer itself) to `classified_data_` and immediately
# overwrote it, which was misleading and duplicated the `drop`.
# NOTE(review): the transformer is fitted on all classified rows, i.e. before the
# train/test split performed later in the notebook, so test-set statistics
# influence the transform. Consider fitting on the training split only.
classified_data_ = pt.fit_transform(classified_data.drop(['flake_id'], axis=1))
display(classified_data_)

In [None]:
# Write the transformed values back into the frame, leaving `flake_id` untouched.
# `classified_data_` is a plain array whose columns follow the ORIGINAL column
# order of `classified_data.drop(['flake_id'], axis=1)`. The previous selector,
# `classified_data.columns.difference(['flake_id'])`, returns the labels SORTED
# by default, so the positional assignment could put values under the wrong
# column names. Selecting the columns with the same `drop(...)` expression
# guarantees both sides are aligned.
classified_data[classified_data.drop(['flake_id'], axis=1).columns] = classified_data_

In [None]:
# Display the frame after the transformed feature values have been written back.
classified_data

In [None]:
# Fetch the "hydro" class labels for the filtered feature database.
# NOTE(review): presumably rows are aligned with `classified_data` (same flakes,
# same order) — confirm in `MASCDB_classes.get_classses`.
classes = mascdb_classes.get_classses("hydro", mascdb_features_filt)
display(classes)

# Just a small test

***

In [None]:
# Crude train / test separation: 67 % train, 33 % test, fixed seed for
# reproducibility. `flake_id` is excluded from both the features and the labels.
X_train, X_test, y_train, y_test = train_test_split(
    classified_data.loc[:, classified_data.columns.difference(['flake_id'])],
    classes.loc[:, classes.columns.difference(['flake_id'])],
    test_size=0.33,
    random_state=42,
)

In [None]:
# Baseline classifier: standard scaling followed by an SVC with gamma='auto',
# trained on the training split (`fit` returns the fitted pipeline itself).
clf = make_pipeline(StandardScaler(), SVC(gamma='auto')).fit(X_train, y_train)

# Predict the classes of the held-out samples.
y_pred = clf.predict(X_test)

In [None]:
# Fraction of test samples whose predicted class matches the true class.
accuracy_score(y_test, y_pred)

# Feature selection test
***

In [None]:
# Crude train / test separation for the feature-selection experiments:
# 67 % train, 33 % test, with a different fixed seed than the small test above.
# `flake_id` is excluded from both the features and the labels.
X_train, X_test, y_train, y_test = train_test_split(
    classified_data.loc[:, classified_data.columns.difference(['flake_id'])],
    classes.loc[:, classes.columns.difference(['flake_id'])],
    test_size=0.33,
    random_state=22,
)

In [None]:
# Feature selection on the training split with the project helper, method 'recursiveCV'.
# NOTE(review): the meaning of the numeric argument (2) is defined in `helpers.features_selection` — confirm there.
X_selec = features_selection(X_train, y_train, 'recursiveCV', 2)

In [None]:
# Shape of the training features before selection, for comparison with the selected sets.
display(X_train.shape)

In [None]:
# Shape of the result returned by the 'recursiveCV' selection.
display(X_selec.shape)

In [None]:
# Feature selection on the training split with the project helper, method 'recursive'.
# NOTE(review): the meaning of the numeric argument (2) is defined in `helpers.features_selection` — confirm there.
X_recursive = features_selection(X_train, y_train, 'recursive', 2)

In [None]:
# Shape of the result returned by the 'recursive' selection.
display(X_recursive.shape)

In [None]:
# Feature selection with method 'lasso' and parameter 0.1.
# NOTE(review): presumably 0.1 is the regularization strength and `True` a
# verbosity/plot flag — both are defined in `helpers.features_selection`; confirm there.
X_lasso = features_selection(X_train, y_train, 'lasso', 0.1, True)
display(X_lasso.shape)

Weird: when param >= 2 for method = 'lasso', all the features are set to zero!

In [None]:
# Feature selection with method 'lassoCV' and parameter 4.
# NOTE(review): the meaning of `4` and of the trailing `True` is defined in
# `helpers.features_selection` — confirm there.
X_lassoCV = features_selection(X_train, y_train, 'lassoCV', 4, True)
display(X_lassoCV.shape)

In [None]:
# Dimensionality reduction with method 'PCA' and parameter 10.
# NOTE(review): presumably 10 is the number of components kept — confirm in
# `helpers.features_selection`.
X_PCA = features_selection(X_train, y_train, 'PCA', 10, True)
print(X_PCA.shape)