In [40]:
import os
import json
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import itertools
import numpy as np
from scipy.interpolate import RBFInterpolator, InterpolatedUnivariateSpline
from sklearn.decomposition import PCA

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor

from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
# from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

data_folder = 'INAF_case_data'

# 0. Read and Combine files from all folders

In [41]:
folders = ['learning', 'test', 'slab']
df = pd.DataFrame()
for folder in folders:
    learning_folder = os.path.join(data_folder, folder)
    for json_file in os.listdir(learning_folder):
        file_name = os.path.join(learning_folder, json_file)
        with open(file_name) as f:
            content = json.load(f)
            a = pd.DataFrame(content.items(), columns=['key', 'values'])
            b = a[a['key']=='spectrum']['values']
            b = b.apply(pd.Series)
            b['file'] = json_file
            b['folder'] = folder

            c = a[a['key']=='abundances']['values']
            if not c.empty:
                if c.values:
                    b['abundances'] = c.values
            df = pd.concat([df, b], ignore_index=True)

df['abundances_len'] = df['abundances'].apply(lambda x: len(x) if isinstance(x, list) else 0)
df['label'] = df['abundances_len']>0

In [42]:
df.head()

Unnamed: 0,wavelength,reflectance,error,file,folder,abundances,abundances_len,label
0,"[320.0, 321.0, 322.0, 323.0, 325.0, 326.0, 327...","[0.2734, 0.27788, 0.28235, 0.28682, 0.29577, 0...","[0.0518, -1.0, -1.0, -1.0, 0.03897, -1.0, -1.0...",c1ag41.json,learning,"[{'mineral_phase_name': 'Olivine', 'percentage...",3,True
1,"[300.0, 301.0, 302.0, 303.0, 305.0, 306.0, 307...","[0.118, 0.11835, 0.11871, 0.11906, 0.11978, 0....","[0.02806, -1.0, -1.0, -1.0, 0.02179, -1.0, -1....",c1kc09.json,learning,"[{'mineral_phase_name': 'Clinopyroxene', 'perc...",2,True
2,"[300.0, 301.0, 302.0, 303.0, 305.0, 306.0, 307...","[0.05399, 0.05409, 0.0542, 0.0543, 0.05452, 0....","[0.07406, -1.0, -1.0, -1.0, 0.05048, -1.0, -1....",c1dd02.json,learning,"[{'mineral_phase_name': 'Olivine', 'percentage...",2,True
3,"[320.0, 321.0, 322.0, 323.0, 325.0, 326.0, 327...","[0.25872, 0.26341, 0.2681, 0.27279, 0.28218, 0...","[0.05129, -1.0, -1.0, -1.0, 0.0387, -1.0, -1.0...",c1ag40.json,learning,"[{'mineral_phase_name': 'Olivine', 'percentage...",3,True
4,"[320.0, 321.0, 322.0, 323.0, 325.0, 326.0, 327...","[0.28346, 0.2875, 0.29154, 0.29558, 0.30366, 0...","[0.04664, -1.0, -1.0, -1.0, 0.03789, -1.0, -1....",c1ag31.json,learning,"[{'mineral_phase_name': 'Plagioclase', 'percen...",3,True


In [43]:
df.shape

(683, 8)

## 1.3. Filter Samples having Labels and Remove the wrong ones

## 1.4. Filter Training Set (151 -> 139 files)

In [44]:
# filter samples having labels (length of abundances > 0, and remove the sample with 7 abundances)
a = df[(df['abundances_len']>0) & (7>df['abundances_len'])].copy() #683 -> 151 files
print(f'#samples with labels (abundances): {a.shape[0]}')

# check sum of % if 100
a['100%'] = a['abundances'].apply(lambda x: sum(i['percentage'] for i in x)==100)
c = a[a['100%']].copy() # 140 files
print(f'#samples with 100% of abundances: {c.shape[0]}')

# discard sample whose mineral_phase_names are duplicated in one sample
c['dup_label'] = c['abundances'].apply(lambda x: len(x)!=len(set(i['mineral_phase_name'] for i in x)))
df_labels = c[~c['dup_label']].copy() # 139 files
print(f'#samples after removed duplicated mineral_phase_name: {df_labels.shape[0]}')

# split the phase_name from abundances
df_labels['phase_name'] = df_labels['abundances'].apply(lambda x: [i['mineral_phase_name'] for i in x])
df_labels.drop(['label', '100%', 'dup_label'], axis=1, inplace=True)

# convert to int
df_labels['wavelength'] = df_labels['wavelength'].apply(lambda x: [int(i) for i in x])

df_labels.head()

#samples with labels (abundances): 151
#samples with 100% of abundances: 140
#samples after removed duplicated mineral_phase_name: 139


Unnamed: 0,wavelength,reflectance,error,file,folder,abundances,abundances_len,phase_name
0,"[320, 321, 322, 323, 325, 326, 327, 328, 330, ...","[0.2734, 0.27788, 0.28235, 0.28682, 0.29577, 0...","[0.0518, -1.0, -1.0, -1.0, 0.03897, -1.0, -1.0...",c1ag41.json,learning,"[{'mineral_phase_name': 'Olivine', 'percentage...",3,"[Olivine, Orthopyroxene, Clinopyroxene]"
1,"[300, 301, 302, 303, 305, 306, 307, 308, 310, ...","[0.118, 0.11835, 0.11871, 0.11906, 0.11978, 0....","[0.02806, -1.0, -1.0, -1.0, 0.02179, -1.0, -1....",c1kc09.json,learning,"[{'mineral_phase_name': 'Clinopyroxene', 'perc...",2,"[Clinopyroxene, Glass]"
2,"[300, 301, 302, 303, 305, 306, 307, 308, 310, ...","[0.05399, 0.05409, 0.0542, 0.0543, 0.05452, 0....","[0.07406, -1.0, -1.0, -1.0, 0.05048, -1.0, -1....",c1dd02.json,learning,"[{'mineral_phase_name': 'Olivine', 'percentage...",2,"[Olivine, Basalt]"
3,"[320, 321, 322, 323, 325, 326, 327, 328, 330, ...","[0.25872, 0.26341, 0.2681, 0.27279, 0.28218, 0...","[0.05129, -1.0, -1.0, -1.0, 0.0387, -1.0, -1.0...",c1ag40.json,learning,"[{'mineral_phase_name': 'Olivine', 'percentage...",3,"[Olivine, Orthopyroxene, Clinopyroxene]"
4,"[320, 321, 322, 323, 325, 326, 327, 328, 330, ...","[0.28346, 0.2875, 0.29154, 0.29558, 0.30366, 0...","[0.04664, -1.0, -1.0, -1.0, 0.03789, -1.0, -1....",c1ag31.json,learning,"[{'mineral_phase_name': 'Plagioclase', 'percen...",3,"[Plagioclase, Orthopyroxene, Clinopyroxene]"


In [45]:
df_labels['folder'].value_counts()

folder
learning    90
slab        49
Name: count, dtype: int64

### Interpolate

In [46]:
# use fitpack2 method
def interpolate_spline(x, y, xi):
    ius = InterpolatedUnivariateSpline(x, y)
    return ius(xi)


def interpolate_rbf(x, y, xi):
    a = np.array(x).reshape(-1, 1)
    b = np.array(y).reshape(-1, 1)
    rbf = RBFInterpolator(a, b)
    fi = rbf(np.array(xi).reshape(-1, 1))
    fi = list(itertools.chain.from_iterable(fi.tolist()))
    return fi

def plot_traces(x, y, xi, yi):
    fig = go.Figure()
    
    # Add traces
    fig.add_trace(go.Scatter(x=x, y=y,
                             mode='markers',
                             name='original data',
                             opacity=0.5,
                            ))
    fig.add_trace(go.Scatter(x=xi, y=yi,
                             mode='markers',
                             name='interpolated data',
                             opacity=0.5,
                        # mode='lines+markers',
                        # name='lines+markers'
                            ))
    
    fig.show()

## Apply Interpolation

In [47]:
# wavelength from 410 to 2500 nm
range_start = 410
range_end = 2501
xi = list(range(range_start, range_end)) # apply for all

df_labels['interpolated'] = [interpolate_rbf(x, y, xi) for x, y in zip(df_labels['wavelength'], df_labels['reflectance'])]
df_labels.head()

Unnamed: 0,wavelength,reflectance,error,file,folder,abundances,abundances_len,phase_name,interpolated
0,"[320, 321, 322, 323, 325, 326, 327, 328, 330, ...","[0.2734, 0.27788, 0.28235, 0.28682, 0.29577, 0...","[0.0518, -1.0, -1.0, -1.0, 0.03897, -1.0, -1.0...",c1ag41.json,learning,"[{'mineral_phase_name': 'Olivine', 'percentage...",3,"[Olivine, Orthopyroxene, Clinopyroxene]","[0.5738999999957741, 0.5751399999899149, 0.576..."
1,"[300, 301, 302, 303, 305, 306, 307, 308, 310, ...","[0.118, 0.11835, 0.11871, 0.11906, 0.11978, 0....","[0.02806, -1.0, -1.0, -1.0, 0.02179, -1.0, -1....",c1kc09.json,learning,"[{'mineral_phase_name': 'Clinopyroxene', 'perc...",2,"[Clinopyroxene, Glass]","[0.2851299999947514, 0.2861999999950058, 0.287..."
2,"[300, 301, 302, 303, 305, 306, 307, 308, 310, ...","[0.05399, 0.05409, 0.0542, 0.0543, 0.05452, 0....","[0.07406, -1.0, -1.0, -1.0, 0.05048, -1.0, -1....",c1dd02.json,learning,"[{'mineral_phase_name': 'Olivine', 'percentage...",2,"[Olivine, Basalt]","[0.27548999998543877, 0.27790000000823056, 0.2..."
3,"[320, 321, 322, 323, 325, 326, 327, 328, 330, ...","[0.25872, 0.26341, 0.2681, 0.27279, 0.28218, 0...","[0.05129, -1.0, -1.0, -1.0, 0.0387, -1.0, -1.0...",c1ag40.json,learning,"[{'mineral_phase_name': 'Olivine', 'percentage...",3,"[Olivine, Orthopyroxene, Clinopyroxene]","[0.5480399999996735, 0.5491900000033638, 0.550..."
4,"[320, 321, 322, 323, 325, 326, 327, 328, 330, ...","[0.28346, 0.2875, 0.29154, 0.29558, 0.30366, 0...","[0.04664, -1.0, -1.0, -1.0, 0.03789, -1.0, -1....",c1ag31.json,learning,"[{'mineral_phase_name': 'Plagioclase', 'percen...",3,"[Plagioclase, Orthopyroxene, Clinopyroxene]","[0.5339100000019243, 0.5347599999950035, 0.535..."


## Labels

In [48]:
# assign each lable with 
label_cols = sorted(list(set(itertools.chain.from_iterable(df_labels['phase_name'].tolist()))))
label_cols

['Basalt',
 'Clinopyroxene',
 'Glass',
 'Olivine',
 'Orthopyroxene',
 'Plagioclase',
 'graphite']

In [49]:
df_labels[label_cols] = 0
for i, values in df_labels['abundances'].items():
    for value in values:
        df_labels.loc[i, value['mineral_phase_name']] = value['percentage']

  df_labels.loc[i, value['mineral_phase_name']] = value['percentage']
  df_labels.loc[i, value['mineral_phase_name']] = value['percentage']
  df_labels.loc[i, value['mineral_phase_name']] = value['percentage']
  df_labels.loc[i, value['mineral_phase_name']] = value['percentage']


In [50]:
df_labels.head()

Unnamed: 0,wavelength,reflectance,error,file,folder,abundances,abundances_len,phase_name,interpolated,Basalt,Clinopyroxene,Glass,Olivine,Orthopyroxene,Plagioclase,graphite
0,"[320, 321, 322, 323, 325, 326, 327, 328, 330, ...","[0.2734, 0.27788, 0.28235, 0.28682, 0.29577, 0...","[0.0518, -1.0, -1.0, -1.0, 0.03897, -1.0, -1.0...",c1ag41.json,learning,"[{'mineral_phase_name': 'Olivine', 'percentage...",3,"[Olivine, Orthopyroxene, Clinopyroxene]","[0.5738999999957741, 0.5751399999899149, 0.576...",0,10.0,0.0,80.0,10.0,0,0
1,"[300, 301, 302, 303, 305, 306, 307, 308, 310, ...","[0.118, 0.11835, 0.11871, 0.11906, 0.11978, 0....","[0.02806, -1.0, -1.0, -1.0, 0.02179, -1.0, -1....",c1kc09.json,learning,"[{'mineral_phase_name': 'Clinopyroxene', 'perc...",2,"[Clinopyroxene, Glass]","[0.2851299999947514, 0.2861999999950058, 0.287...",0,90.0,10.0,0.0,0.0,0,0
2,"[300, 301, 302, 303, 305, 306, 307, 308, 310, ...","[0.05399, 0.05409, 0.0542, 0.0543, 0.05452, 0....","[0.07406, -1.0, -1.0, -1.0, 0.05048, -1.0, -1....",c1dd02.json,learning,"[{'mineral_phase_name': 'Olivine', 'percentage...",2,"[Olivine, Basalt]","[0.27548999998543877, 0.27790000000823056, 0.2...",90,0.0,0.0,10.0,0.0,0,0
3,"[320, 321, 322, 323, 325, 326, 327, 328, 330, ...","[0.25872, 0.26341, 0.2681, 0.27279, 0.28218, 0...","[0.05129, -1.0, -1.0, -1.0, 0.0387, -1.0, -1.0...",c1ag40.json,learning,"[{'mineral_phase_name': 'Olivine', 'percentage...",3,"[Olivine, Orthopyroxene, Clinopyroxene]","[0.5480399999996735, 0.5491900000033638, 0.550...",0,15.0,0.0,70.0,15.0,0,0
4,"[320, 321, 322, 323, 325, 326, 327, 328, 330, ...","[0.28346, 0.2875, 0.29154, 0.29558, 0.30366, 0...","[0.04664, -1.0, -1.0, -1.0, 0.03789, -1.0, -1....",c1ag31.json,learning,"[{'mineral_phase_name': 'Plagioclase', 'percen...",3,"[Plagioclase, Orthopyroxene, Clinopyroxene]","[0.5339100000019243, 0.5347599999950035, 0.535...",0,25.0,0.0,0.0,25.0,50,0


In [51]:
# df_labels[['wavelength', 'reflectance', 'interpolated']+label_cols].to_csv('output/training.csv', index=None)

### Apply PCA

In [52]:
n_components = 20

X = df_labels['interpolated'].tolist()
pca = PCA(n_components=n_components)
X_new = pca.fit_transform(X)
# print(pca.explained_variance_ratio_)
# print(pca.singular_values_)

# Regressor

## Random Forest

In [53]:
y = df_labels[label_cols].values

X_train, X_test, y_train, y_test = train_test_split(
    X_new, y, test_size=.2, random_state=4
)
max_depth = 30
regr_multirf = MultiOutputRegressor(
    RandomForestRegressor(n_estimators=100, max_depth=max_depth, random_state=0)
)
regr_multirf.fit(X_train, y_train)

regr_rf = RandomForestRegressor(n_estimators=100, max_depth=max_depth, random_state=2)
regr_rf.fit(X_train, y_train)

# Predict on new data
y_multirf = regr_multirf.predict(X_test)
y_rf = regr_rf.predict(X_test)

# XGB

In [54]:
regr_multixgb = MultiOutputRegressor(
    XGBRegressor(n_estimators=1000, max_depth=max_depth, eta=0.1, subsample=0.7, colsample_bytree=0.8))
regr_multixgb.fit(X_train, y_train)

regr_xgb = XGBRegressor(n_estimators=1000, max_depth=max_depth, eta=0.1, subsample=0.7, colsample_bytree=0.8)
regr_xgb.fit(X_train, y_train)

y_multixgb = regr_multixgb.predict(X_test)
y_xgb = regr_xgb.predict(X_test)

In [55]:
y_multirf.sum(axis=1)

array([ 97.7   ,  97.3325, 114.925 ,  80.99  , 104.27  ,  96.56  ,
        96.07  , 163.735 , 103.595 , 142.245 ,  81.6425,  82.735 ,
        90.8725, 122.045 ,  93.3375, 127.85  , 142.455 , 110.235 ,
        83.4425,  93.345 , 111.8525,  89.6575, 110.275 ,  99.765 ,
        99.59  , 103.74  ,  80.145 , 138.96  ])

### random forest gives sum of 100

In [56]:
y_rf.sum(axis=1)

array([100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100.,
       100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100.,
       100., 100., 100., 100., 100., 100.])

In [57]:
y_multixgb.sum(axis=1)

array([ 96.90204 , 103.2953  , 101.68019 , 105.0621  ,  95.272766,
        91.33167 , 104.14286 , 125.63564 , 108.746315, 126.14473 ,
        69.416245,  95.13107 , 111.10262 , 116.89423 , 110.3807  ,
        98.06684 , 132.84395 , 103.12186 , 100.3277  , 107.896   ,
       107.48399 ,  86.25944 , 110.032135,  96.82605 , 108.492256,
       102.08181 ,  83.143845, 103.032906], dtype=float32)

In [58]:
y_xgb.sum(axis=1)

array([100.34336 , 106.20136 ,  98.2851  , 105.194275, 101.95519 ,
        80.454185,  98.462204, 125.88246 , 105.60967 , 121.85099 ,
        95.368095,  92.31808 , 113.44143 , 117.51532 ,  96.05436 ,
       122.87503 , 128.347   , 112.73127 , 102.69262 ,  98.674416,
        96.12115 ,  83.24479 ,  87.836815,  92.53829 ,  90.47205 ,
       108.71725 ,  92.33838 ,  94.83307 ], dtype=float32)

# Read the results

In [59]:
reg_results = {
    'y_multirf': y_multirf,
    'y_rf': y_rf,
    'y_multixgb': y_multixgb,
    'y_xgb': y_xgb    
}
regressor_results = {}
for i, v in reg_results.items():
    regressor_results[i] = [mean_squared_error(y_test, v, squared=False), mean_squared_error(y_test, v, squared=True)]
regressor_results = pd.DataFrame.from_dict(regressor_results, orient='index')
regressor_results.columns = ['RMSE', 'MSE']
regressor_results

Unnamed: 0,RMSE,MSE
y_multirf,13.177156,237.616369
y_rf,13.034343,235.714566
y_multixgb,11.718738,180.181365
y_xgb,11.836717,186.534029


#### Compare the prediction vs. true labels

In [60]:
df_y_rf = pd.DataFrame(y_rf)
df_y_rf.columns = label_cols
df_y_test = pd.DataFrame(y_test)
df_y_test.columns = label_cols
a = df_y_test.join(df_y_rf, lsuffix='_true', rsuffix='_pred')
a = a[sorted(a.columns)]
a

Unnamed: 0,Basalt_pred,Basalt_true,Clinopyroxene_pred,Clinopyroxene_true,Glass_pred,Glass_true,Olivine_pred,Olivine_true,Orthopyroxene_pred,Orthopyroxene_true,Plagioclase_pred,Plagioclase_true,graphite_pred,graphite_true
0,1.25,0.0,3.0825,3.0,0.0,0.0,51.82,69.0,20.9975,28.0,22.85,0.0,0.0,0.0
1,0.0,0.0,11.2,3.0,0.1,0.0,61.78,60.0,22.12,17.0,4.8,20.0,0.0,0.0
2,0.0,0.0,94.73,100.0,0.105,0.0,0.1,0.0,2.565,0.0,2.5,0.0,0.0,0.0
3,14.6,0.0,17.0825,0.0,0.46,2.0,35.63,0.0,14.6275,98.0,17.6,0.0,0.0,0.0
4,0.5,0.0,9.205,9.0,0.025,0.0,2.89,0.0,10.08,11.0,77.3,80.0,0.0,0.0
5,2.0,0.0,20.4175,20.0,0.225,0.0,52.185,60.0,21.5725,20.0,3.55,0.0,0.05,0.0
6,0.0,0.0,38.525,45.0,0.33,0.0,14.47,0.0,37.925,45.0,8.75,10.0,0.0,0.0
7,17.55,90.0,26.2375,0.0,0.39,0.0,25.24,10.0,18.1325,0.0,12.45,0.0,0.0,0.0
8,0.0,0.0,17.26,7.5,0.255,0.0,2.56,0.0,47.925,42.5,32.0,50.0,0.0,0.0
9,6.3,0.0,70.63,99.5,0.365,0.5,9.41,0.0,4.095,0.0,9.2,0.0,0.0,0.0


In [61]:
# a.to_csv('output/result_regr_validation_rf.csv', index=None)

In [62]:
pd.DataFrame(X_test)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,9.981195,2.176574,-0.10879,0.84098,0.021038,-0.02331,-0.131326,-0.106172,-0.184526,-0.076542,0.007044,0.033087,0.023875,-0.008372,0.012255,-0.007052,-0.013477,-0.027509,0.025587,0.025358
1,-10.816964,-0.205457,0.00297,0.327624,-0.743266,-0.238808,0.089234,-0.155027,0.005131,-0.021129,0.002527,-0.032228,-0.039909,0.019213,0.016345,3.1e-05,-0.014617,0.005052,0.011647,-0.014025
2,4.253014,-0.972553,1.1876,-0.348923,1.285037,-0.537237,0.172801,0.284775,-0.284984,0.033431,0.088427,0.006071,-0.107668,-0.087964,-0.064629,-0.006167,0.024786,0.008294,0.0084,0.001032
3,-6.172273,3.87933,-0.556009,0.822981,0.19333,0.565928,0.677175,0.471056,0.139598,-0.076894,0.131619,0.073617,-0.11615,0.01565,-0.002522,-0.00933,-0.001244,0.028074,0.006373,-0.023799
4,2.12657,-0.21477,-1.138328,-0.971952,-0.105732,0.315472,0.219615,-0.03444,0.057801,-0.052619,-0.014769,-0.042374,0.054515,-0.047135,0.005376,-0.042005,-0.004825,0.035719,-0.002267,0.015679
5,-8.002643,-0.372637,0.663005,0.242305,0.008916,-0.280404,0.156807,-0.147991,0.007044,-0.071954,-0.004338,0.067487,-0.016911,0.005996,-0.019934,-0.009716,-0.009726,-0.012344,-0.009309,0.001789
6,-3.749187,-2.469376,0.480716,0.304969,0.6624,-0.246933,-0.02997,0.052165,0.028426,-0.060248,0.026618,0.069425,-0.024984,-0.003106,-0.024858,0.002874,-0.00127,0.00172,-0.022485,0.011306
7,-4.436859,6.131392,0.861281,2.774303,1.03262,0.7074,-0.018998,-0.083497,0.231149,-0.07491,-0.168621,-0.081589,-0.141441,-0.082128,-0.074676,-0.010106,0.080483,0.014512,-0.026179,-0.003735
8,-8.078882,-2.997522,-0.632232,0.163136,-0.347641,-0.03616,-0.46306,0.116361,0.150662,-0.015204,-0.008756,-0.015264,-0.001529,-0.021956,0.019927,0.021423,0.008619,0.004638,0.003129,0.001322
9,8.158168,1.082171,1.265564,-0.350324,0.95108,-0.172643,0.239659,0.387837,0.180137,-0.045075,-0.076578,0.115763,0.005263,0.066053,0.010034,-0.054439,0.009183,-0.023072,0.023041,0.025562


# Test data (no labels)

In [63]:
a = df[~df['label']].copy()
a['interpolated'] = [interpolate_rbf(x, y, xi) for x, y in zip(a['wavelength'], a['reflectance'])]
df_test = a['interpolated'].tolist()
# apply pca (fit from X) on test data
df_test_new = pca.transform(df_test)
df_test_new

array([[ 1.23565143e+01, -4.50467014e-01, -7.62090978e-01, ...,
        -1.85682899e-02, -8.66795483e-03,  3.69998640e-02],
       [-2.37643016e+01, -5.42706846e-01, -8.54936826e-01, ...,
         2.99102735e-02,  6.14107735e-03,  2.59230127e-02],
       [ 1.50628008e+00,  1.04967818e+00, -2.00316823e-02, ...,
         7.13608636e-03,  2.48590431e-02, -2.24718733e-02],
       ...,
       [-3.34422380e+00, -5.92817828e-01, -1.00704635e+00, ...,
         4.43894498e-02,  1.66592156e-02,  9.91350342e-03],
       [ 4.10376485e+00, -2.39347665e+00, -3.67382785e-01, ...,
         1.86871245e-02,  8.04549942e-03, -1.06735582e-02],
       [ 3.82908875e+00, -1.13336397e+00, -6.20758203e-01, ...,
         1.34659923e-02,  8.85474536e-03, -1.26728413e-04]])

In [64]:
# a.to_csv('output/test_interpolated.zip', index=None)

In [65]:
df_rf = regr_rf.predict(df_test_new)
df_rf.sum(axis=1) # sum of all prediction in one sample -> 100

array([100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100.,
       100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100.,
       100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100.,
       100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100.,
       100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100.,
       100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100.,
       100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100.,
       100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100.,
       100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100.,
       100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100.,
       100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100.,
       100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100.,
       100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100.,
       100., 100., 100., 100., 100., 100., 100., 10

In [66]:
a = pd.DataFrame(df_rf)
a.columns = label_cols
# a.to_csv('output/result_regr_test_rf.csv', index=None)
a

Unnamed: 0,Basalt,Clinopyroxene,Glass,Olivine,Orthopyroxene,Plagioclase,graphite
0,0.00,53.2700,0.125,13.250,16.1550,17.20,0.00
1,1.90,31.7900,0.690,10.405,14.7150,40.45,0.05
2,11.75,26.3200,0.450,29.610,8.4200,23.45,0.00
3,24.55,26.0825,0.725,36.355,6.0875,6.15,0.05
4,8.50,32.8950,0.565,33.835,10.2550,13.90,0.05
...,...,...,...,...,...,...,...
526,8.85,29.1600,0.335,25.030,7.6750,28.95,0.00
527,16.25,31.4225,1.100,35.045,8.7325,7.40,0.05
528,0.95,12.6700,0.025,2.540,14.8150,69.00,0.00
529,0.00,44.7200,0.680,0.900,35.8000,17.90,0.00


# Classification

### convert percentage to 1/0

In [67]:
labels = df_labels[label_cols].copy()
for label in label_cols:
    labels[label] = labels[label].apply(lambda x: 1 if x>0 else 0)
labels

Unnamed: 0,Basalt,Clinopyroxene,Glass,Olivine,Orthopyroxene,Plagioclase,graphite
0,0,1,0,1,1,0,0
1,0,1,1,0,0,0,0
2,1,0,0,1,0,0,0
3,0,1,0,1,1,0,0
4,0,1,0,0,1,1,0
...,...,...,...,...,...,...,...
677,0,1,0,0,1,1,0
679,0,1,0,1,1,1,0
680,0,0,0,1,1,1,0
681,0,1,0,0,1,0,0


## Random Forest

In [68]:
y = labels.values

# split into train and validation set = 80% - 20% = 111 samples for learning - 28 samples for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X_new, y, test_size=.2, random_state=4
)
# define arguments for the classifier 
max_depth = 30

# MultiOutputClassifier
cl_multirf = MultiOutputClassifier(
    RandomForestClassifier(n_estimators=100, max_depth=max_depth, random_state=0)
)
cl_multirf.fit(X_train, y_train)

# RandomForestClassifier
cl_rf = RandomForestClassifier(n_estimators=100, max_depth=max_depth, random_state=2)
cl_rf.fit(X_train, y_train)

# Predict on validation set (28 samples)
y_multirf = cl_multirf.predict(X_test)
y_rf = cl_rf.predict(X_test)

In [69]:
y_multirf

array([[0, 1, 0, 1, 1, 1, 0],
       [0, 1, 0, 1, 1, 0, 0],
       [0, 1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 1, 1, 0],
       [0, 1, 0, 1, 1, 0, 0],
       [0, 1, 0, 1, 1, 0, 0],
       [0, 0, 0, 1, 0, 0, 0],
       [0, 1, 0, 0, 1, 1, 0],
       [0, 1, 0, 0, 0, 0, 0],
       [0, 1, 0, 1, 1, 1, 0],
       [0, 1, 0, 0, 0, 0, 0],
       [0, 1, 0, 1, 1, 1, 0],
       [0, 1, 0, 0, 1, 1, 0],
       [0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 0, 0],
       [0, 1, 0, 1, 1, 1, 0],
       [0, 1, 0, 1, 1, 1, 0],
       [0, 1, 0, 1, 1, 1, 0],
       [0, 1, 0, 1, 1, 1, 0],
       [0, 0, 0, 1, 1, 0, 0],
       [0, 1, 0, 0, 0, 0, 0],
       [0, 1, 0, 1, 1, 0, 0],
       [0, 1, 0, 0, 0, 0, 0],
       [0, 1, 0, 1, 1, 1, 0],
       [0, 1, 0, 1, 1, 1, 0],
       [0, 1, 0, 0, 0, 0, 0]])

In [70]:
y_rf

array([[0, 1, 0, 1, 1, 1, 0],
       [0, 1, 0, 1, 1, 0, 0],
       [0, 1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 1, 1, 0],
       [0, 1, 0, 1, 1, 0, 0],
       [0, 1, 0, 1, 1, 0, 0],
       [0, 0, 0, 1, 0, 0, 0],
       [0, 1, 0, 0, 1, 1, 0],
       [0, 1, 0, 0, 0, 0, 0],
       [0, 1, 0, 1, 1, 1, 0],
       [0, 1, 0, 0, 0, 0, 0],
       [0, 1, 0, 1, 1, 1, 0],
       [0, 1, 0, 0, 1, 1, 0],
       [0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 0, 0],
       [0, 1, 0, 1, 1, 1, 0],
       [0, 1, 0, 1, 1, 1, 0],
       [0, 1, 0, 1, 1, 1, 0],
       [0, 1, 0, 1, 1, 1, 0],
       [0, 0, 0, 1, 1, 0, 0],
       [0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 1, 0, 0],
       [0, 1, 0, 0, 0, 0, 0],
       [0, 1, 0, 1, 1, 1, 0],
       [0, 1, 0, 1, 1, 1, 0],
       [0, 1, 0, 0, 1, 0, 0]])

In [71]:
accuracy_score(y_test, y_rf), accuracy_score(y_test, y_multirf)

(0.4642857142857143, 0.4642857142857143)

#### random forest

In [72]:
df_y_rf = pd.DataFrame(y_rf)
df_y_rf.columns = label_cols
df_y_test = pd.DataFrame(y_test)
df_y_test.columns = label_cols
a = df_y_test.join(df_y_rf, lsuffix='_true', rsuffix='_pred')
a = a[sorted(a.columns)]
# a.to_csv('output/result_cl_validation_rf.csv', index=None)
a

Unnamed: 0,Basalt_pred,Basalt_true,Clinopyroxene_pred,Clinopyroxene_true,Glass_pred,Glass_true,Olivine_pred,Olivine_true,Orthopyroxene_pred,Orthopyroxene_true,Plagioclase_pred,Plagioclase_true,graphite_pred,graphite_true
0,0,0,1,1,0,0,1,1,1,1,1,0,0,0
1,0,0,1,1,0,0,1,1,1,1,0,1,0,0
2,0,0,1,1,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,1,0,0,0,1,0,0,0,0
4,0,0,1,1,0,0,0,0,1,1,1,1,0,0
5,0,0,1,1,0,0,1,1,1,1,0,0,0,0
6,0,0,1,1,0,0,1,0,1,1,0,1,0,0
7,0,1,0,0,0,0,1,1,0,0,0,0,0,0
8,0,0,1,1,0,0,0,0,1,1,1,1,0,0
9,0,0,1,1,0,1,0,0,0,0,0,0,0,0


#### multioutput random forest

In [73]:
df_y_rf = pd.DataFrame(y_multirf)
df_y_rf.columns = label_cols
df_y_test = pd.DataFrame(y_test)
df_y_test.columns = label_cols
a = df_y_test.join(df_y_rf, lsuffix='_true', rsuffix='_pred')
a = a[sorted(a.columns)]
# a.to_csv('output/result_cl_validation_multirf.csv', index=None)
a

Unnamed: 0,Basalt_pred,Basalt_true,Clinopyroxene_pred,Clinopyroxene_true,Glass_pred,Glass_true,Olivine_pred,Olivine_true,Orthopyroxene_pred,Orthopyroxene_true,Plagioclase_pred,Plagioclase_true,graphite_pred,graphite_true
0,0,0,1,1,0,0,1,1,1,1,1,0,0,0
1,0,0,1,1,0,0,1,1,1,1,0,1,0,0
2,0,0,1,1,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,1,0,0,0,1,0,0,0,0
4,0,0,1,1,0,0,0,0,1,1,1,1,0,0
5,0,0,1,1,0,0,1,1,1,1,0,0,0,0
6,0,0,1,1,0,0,1,0,1,1,0,1,0,0
7,0,1,0,0,0,0,1,1,0,0,0,0,0,0
8,0,0,1,1,0,0,0,0,1,1,1,1,0,0
9,0,0,1,1,0,1,0,0,0,0,0,0,0,0


## Test

In [74]:
y_multirf_test = cl_multirf.predict(df_test_new)
y_rf_test = cl_rf.predict(df_test_new)

In [75]:
a = pd.DataFrame(y_multirf_test)
a.columns = label_cols
# a.to_csv('output/result_cl_test_multirf.csv', index=None)
a

Unnamed: 0,Basalt,Clinopyroxene,Glass,Olivine,Orthopyroxene,Plagioclase,graphite
0,0,0,0,0,0,0,0
1,0,1,0,0,0,1,0
2,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...
526,0,0,0,0,0,0,0
527,1,0,0,1,0,0,0
528,0,1,0,0,1,1,0
529,0,1,0,0,1,1,0


In [76]:
a = pd.DataFrame(y_rf_test)
a.columns = label_cols
# a.to_csv('output/result_cl_test_rf.csv', index=None)
a

Unnamed: 0,Basalt,Clinopyroxene,Glass,Olivine,Orthopyroxene,Plagioclase,graphite
0,0,1,0,0,0,0,0
1,0,1,0,0,1,0,0
2,0,0,0,1,0,0,0
3,0,0,0,1,0,0,0
4,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...
526,0,0,0,1,0,0,0
527,0,0,0,1,0,0,0
528,0,1,0,0,1,1,0
529,0,1,0,0,1,1,0
