In [1]:
# Import the MASCDB API
import mascdb.api
from mascdb.api import MASC_DB

# Import other libraries
from IPython.display import display
import numpy as np
import pandas as pd
import pyarrow

# Import sklearn tools
from sklearn.model_selection import *
from sklearn.metrics import *
from sklearn import *
from sklearn.pipeline import *
from sklearn.preprocessing import *
from sklearn.ensemble import *
from sklearn.svm import *
from sklearn.preprocessing import * 

# Import files
from helpers import *
from cross_validation import *
from models import *

# Autoreload: load the IPython autoreload extension and set it to mode 2, so that
# edits to imported modules are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

# Data cleaning

***

In [None]:
# Path to the MASCDB feature database, relative to the notebook's working directory
masc_path = 'Data/MASCDB'
mascdb_features = MASC_DB(masc_path)

# Build the class-label helper from the "Data" directory; the cells below use it
# to fetch the labelled data and the matching classes.
# NOTE(review): MASCDB_classes is not imported by name; presumably it comes from one
# of the wildcard imports (helpers / cross_validation / models) — confirm
Mascdb_classes = MASCDB_classes("Data")

In [None]:
# Name fragments identifying the columns we exclude from our experiments
black_list_words = ['roi', 'riming', 'melting', 'snowflake', 'hl']

# Every cam0 column whose name contains at least one blacklisted fragment
cols_to_delete = [
    column
    for column in mascdb_features.cam0.columns
    if any(word in column for word in black_list_words)
]

# Additional columns that are dropped as well
cols_to_delete += ['datetime', 'pix_size', 'flake_number_tmp', 'event_id']

mascdb_features_filt = mascdb_features.drop_cam_columns(cols_to_delete)

# Drop duplicates

***

In [None]:
# Set the classifier (the classification task to work on). Both calls below read
# this variable, so switching task only requires editing this one line.
classifier = 'hydro'

# Get the data and the corresponding classes for the selected task
mascdb_data = Mascdb_classes.get_classified_data(classifier, mascdb_features_filt)
mascdb_classes = Mascdb_classes.get_classes(classifier, mascdb_features_filt)

display(mascdb_data)
display(mascdb_classes)

In [None]:
# Collect the "wrong duplicates": rows that share a flake_id with another row
# without being an exact copy of it
mascdb_classes_copy = mascdb_classes.copy()

# Rows having an identical twin over all columns ...
is_exact_duplicate = mascdb_classes_copy.duplicated(subset=None, keep=False)
# ... and rows whose flake_id occurs more than once
shares_flake_id = mascdb_classes_copy.duplicated(subset=['flake_id'], keep=False)

mascdb_classes_copy_1 = mascdb_classes_copy[is_exact_duplicate]
mascdb_classes_copy_2 = mascdb_classes_copy[shares_flake_id]

# Stack both selections and discard every row that occurs more than once in the
# stack (keep=False); only rows with a repeated flake_id and no exact twin survive
stacked = pd.concat([mascdb_classes_copy_1, mascdb_classes_copy_2])
mascdb_classes_wrong_duplicates = stacked.drop_duplicates(keep=False)

In [None]:
# One row per flake_id found among the wrong duplicates
mascdb_classes_wrong_duplicates_unique = mascdb_classes_wrong_duplicates.drop_duplicates(
    subset=['flake_id'], keep='first'
)

# One row per flake_id that has a class (first occurrence is kept)
flake_id_classes = mascdb_classes_copy.drop_duplicates(subset=['flake_id'], keep='first')

# Stack both tables: a flake_id present in both now appears twice, and keep=False
# removes all of its rows, leaving only the flakes without a wrong duplicate
with_conflicts_appended = pd.concat([flake_id_classes, mascdb_classes_wrong_duplicates_unique])
mascdb_classes_modified = with_conflicts_appended.drop_duplicates(
    subset=['flake_id'], keep=False
)

In [None]:
# Keep only the data rows whose flake_id is still present in the cleaned class
# table, so that every remaining snowflake has exactly one class
has_single_class = mascdb_data['flake_id'].isin(mascdb_classes_modified['flake_id'])
mascdb_data_modified = mascdb_data[has_single_class]

#### Transform the data (standardization)

***

In [None]:
# Transform the data
# Work on an independent copy so nothing below writes into a slice of `mascdb_data`
mascdb_data_modified_copy = mascdb_data_modified.copy()

# Feature columns in their ORIGINAL order. `columns.difference(['flake_id'])`
# returns a sorted Index; using it as the assignment target would write the
# transformed values (which follow the original order) into the wrong columns.
feature_cols = mascdb_data_modified_copy.columns.drop('flake_id')

# Yeo-Johnson power transform, followed by standardization (standardize=True).
# fit_transform replaces the former separate fit / transform calls on the same data.
power_transformer = preprocessing.PowerTransformer(method='yeo-johnson', standardize=True)
mascdb_data_modified_std = power_transformer.fit_transform(mascdb_data_modified_copy[feature_cols])

# Set the transformed data on a fresh frame; `flake_id` is left untouched.
# NOTE: this rebinds `mascdb_data_modified`, so re-running this cell without
# re-running the filtering cell would transform already-transformed values.
mascdb_data_modified = mascdb_data_modified_copy.copy()
mascdb_data_modified[feature_cols] = mascdb_data_modified_std

# Modeling

***

In [None]:
# Split into a data set X_ and a response set y_
# Feature columns only (`columns.difference` returns them sorted by name)
feature_columns = mascdb_data_modified.columns.difference(['flake_id'])

# Class table indexed by flake_id, so it can be joined on the index below
# (set_index already returns a new frame; no extra copy is needed)
y_ = mascdb_classes_modified.set_index('flake_id')

# Get a column as flake_id (taken from the index), suppress all the duplicated
# flake_id and attach the corresponding class.
# `.assign` builds a new frame instead of adding a column to a slice of
# `mascdb_data_modified`, which avoids pandas' SettingWithCopyWarning.
# NOTE(review): flake_id is rebuilt from the index, not from the existing
# `flake_id` column; the join only lines up if the index holds flake ids — confirm
X_ = (
    mascdb_data_modified[feature_columns]
    .assign(flake_id=lambda frame: frame.index)
    .drop_duplicates(subset='flake_id', keep='first')
    .join(y_)
)

In [None]:
# Response set y: a single-column frame holding the class of each flake
y = X_['class_id'].to_frame()

# Data set X: every column except the identifier and the response
predictor_columns = X_.columns.difference(['flake_id', 'class_id'])
X = X_[predictor_columns]

In [None]:
# Get a train and test set for modelling
# NOTE(review): `split_data` is not defined in this notebook; presumably it comes from
# one of the wildcard imports (helpers / cross_validation / models). The unpacking
# assumes it returns (X_train, y_train, X_test, y_test) in that order, and the
# meaning of n_s = 5 is not visible here — confirm both against its definition
X_train, y_train, X_test, y_test = split_data(X, y, n_s = 5)