In [None]:
import os
import json
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

import numpy as np
from scipy.interpolate import RBFInterpolator, InterpolatedUnivariateSpline
# from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA

import more_itertools as mit
import itertools

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor

# pd.set_option('display.max_colwidth', None)
# Root directory that holds the sub-folders read by the loading cell below.
data_folder = 'INAF_case_data'

# 0. Read and Combine files from all folders

In [None]:
# Sub-folders of `data_folder` to load; every file inside is parsed as one JSON sample.
folders = ['learning', 'test', 'slab']

# Collect one plain dict per sample and build the frame once at the end.
# Growing a DataFrame with pd.concat inside the loop is quadratic and, on
# recent pandas, warns when concatenating onto the empty seed frame.
records = []
for folder in folders:
    folder_path = os.path.join(data_folder, folder)
    # sorted(): os.listdir returns entries in arbitrary order, so sort for a
    # reproducible row order.
    for json_file in sorted(os.listdir(folder_path)):
        file_name = os.path.join(folder_path, json_file)
        # Skip sub-directories (e.g. Jupyter's `.ipynb_checkpoints`), which
        # would make open() fail.
        if not os.path.isfile(file_name):
            continue
        with open(file_name) as f:
            # `content` stays a notebook global: later cells inspect its keys.
            content = json.load(f)

        spectrum = content.get('spectrum')
        if spectrum is None:
            # A file without a 'spectrum' entry contributes no row.
            continue

        # One row per file: the spectrum's keys become columns.
        record = dict(spectrum)
        record['file'] = json_file
        record['folder'] = folder
        # Only non-empty abundances are stored; other rows get NaN.
        abundances = content.get('abundances')
        if abundances:
            record['abundances'] = abundances
        records.append(record)

df = pd.DataFrame(records)
# Guarantee the column exists even when no file carries abundances.
if 'abundances' not in df.columns:
    df['abundances'] = np.nan

# Number of abundance entries per sample (0 when the sample has none).
df['abundances_len'] = df['abundances'].apply(lambda x: len(x) if isinstance(x, list) else 0)
# A sample counts as labelled when it has at least one abundance entry.
df['label'] = df['abundances_len'] > 0

In [None]:
# Preview the first rows of the combined frame.
df.head()

In [None]:
# Number of distinct file names across all loaded folders.
df['file'].nunique()

In [None]:
# Display only the core columns of every loaded sample.
cols = ['folder', 'file', 'wavelength', 'reflectance', 'error', 'abundances']
df[cols]

In [None]:
# Top-level keys of `content`, i.e. of the last JSON file parsed by the loading loop.
content.keys()

In [None]:
# Keys inside the 'spectrum' entry of that same file.
content['spectrum'].keys()

# 1. Labels

## 1.1. Label counts per folder

In [None]:
# Count labelled vs. unlabelled samples within each folder and plot them side by side.
a = (
    df.groupby('folder')['label']
      .value_counts()
      .reset_index()
)
fig = px.bar(
    a,
    x='folder',
    y='count',
    color='label',
    barmode='group',
    text_auto=True,
    title='#labels in each folder',
    width=400,
)
fig.show()

In [None]:
# Histogram of the number of abundance entries per sample, coloured by folder.
px.histogram(
    df,
    x='abundances_len',
    color='folder',
    text_auto=True,
    title='Length of Abundances',
    width=550,
    height=550,
)

In [None]:
# Shapes of the unlabelled and the labelled subsets, in that order.
df[~df['label']].shape, df[df['label']].shape

## 1.2. Check that abundance percentages sum to 100

In [None]:
# Check whether the abundance percentages of each labelled sample sum to 100.
# Only samples with 1 to 6 abundance entries are considered
# (author's note: 151 files).
a = df[(df['abundances_len'] > 0) & (df['abundances_len'] < 7)].copy()
# Compare with a small tolerance: summing float percentages can be off by a
# rounding error, which an exact `== 100` would wrongly flag as a bad label.
a['100%'] = a['abundances'].apply(
    lambda x: abs(sum(i['percentage'] for i in x) - 100) < 1e-6
)
# Samples whose percentages do not add up (author's note: 11 files).
b = a[~a['100%']]
b.head()

In [None]:
# Identify the samples in `b` (rows whose '100%' flag is False) and show their abundances.
b[['folder', 'file', 'abundances']]

## 1.3. Remove wrong labels

In [None]:
# Does any sample list the same mineral phase more than once?
# Samples that failed the 100% check are left out (author's note: 140 files remain).
c = a.loc[a['100%']].copy()
c['dup_label'] = c['abundances'].apply(
    lambda entries: len({entry['mineral_phase_name'] for entry in entries}) != len(entries)
)
tmp = c.loc[c['dup_label'], ['folder', 'file', 'abundances']]
tmp

In [None]:
# Raw abundance lists of the samples flagged with a repeated phase name.
tmp['abundances'].tolist()

## 1.4. Filter Training Set (139 files)

In [None]:
# Keep the samples without a repeated phase name (author's note: 139 files)
# and extract the list of phase names of each one.
d = c.loc[~c['dup_label']].copy()
d['phase_name'] = d['abundances'].map(
    lambda entries: [entry['mineral_phase_name'] for entry in entries]
)
d.head()

In [None]:
# Frequency of each mineral phase name over all filtered samples.
# `itertools` comes from the notebook's import cell; the redundant in-cell
# import was removed so all dependencies are declared in one place.
e = pd.Series(itertools.chain.from_iterable(d['phase_name'].tolist())).value_counts()
px.bar(e, width=600, title='mineral_name_phase counts', text_auto=True)

In [None]:
# Build one key per sample by joining its sorted phase names with '_',
# then count how often each combination occurs.
d['combined_name'] = d['phase_name'].map(lambda names: '_'.join(sorted(names)))
e = d['combined_name'].value_counts()
fig = px.bar(
    e,
    title='combined name counts',
    text_auto=True,
    width=700,
    height=450,
)
fig.update_layout(showlegend=False)
fig.show()

In [None]:
# Same combination counts, now split by source folder.
e = d[['combined_name', 'folder']].value_counts().reset_index()
fig = px.bar(
    e,
    x='combined_name',
    y='count',
    color='folder',
    title='combined name counts',
    text_auto=True,
    width=700,
    height=450,
)
fig.show()

### Plot the labels

In [None]:
# Overlay every labelled spectrum in one figure, with one colour and one
# legend entry per phase combination.
labels = d['combined_name'].unique()
color_list = px.colors.qualitative.Dark24

fig = go.Figure()
for idx, label in enumerate(labels):
    df_label = d[d['combined_name'] == label]
    # Cycle through the palette so that more labels than colours cannot
    # raise an IndexError.
    color = color_list[idx % len(color_list)]
    for cnt, (_, row) in enumerate(df_label.iterrows()):
        fig.add_trace(
            # go.Line is deprecated in plotly; a line-mode Scatter is the
            # supported equivalent.
            go.Scatter(
                x=row['wavelength'],
                y=row['reflectance'],
                mode='lines',
                line=dict(color=color),
                legendgroup=label,
                # Only the first trace of each group shows up in the legend.
                showlegend=(cnt == 0),
                name=row['combined_name'],
            )
        )

fig.update_xaxes(range=[280, 2750], title='wavelength')
fig.update_yaxes(range=[0, 1], title='reflectance')
fig.update_layout(height=800)
fig.show()

In [None]:
# Reference integer ranges the wavelengths are compared against:
# 280..2700 for the 'learning' folder, 350..2500 for every other folder.
range_learning = set(range(280, 2701))
range_other = set(range(350, 2501))

# Cast every wavelength to int.
df['wavelength'] = df['wavelength'].map(lambda values: [int(v) for v in values])

# Sorted wavelengths of the reference range that are absent from each sample.
missed_per_sample = []
for waves, folder_name in zip(df['wavelength'], df['folder']):
    reference = range_learning if folder_name == 'learning' else range_other
    missed_per_sample.append(sorted(reference - set(waves)))
df['missed_wavelength'] = missed_per_sample
df['missed_wavelength_len'] = df['missed_wavelength'].map(len)

# Collapse runs of consecutive missing wavelengths: runs longer than two
# become [first, last]; shorter runs are kept as they are.
df['missed_wavelength_range'] = df['missed_wavelength'].map(
    lambda gaps: [
        [min(run), max(run)] if len(run) > 2 else run
        for run in map(list, mit.consecutive_groups(gaps))
    ]
)
df['missed_wavelength_range_len'] = df['missed_wavelength_range'].map(len)

# Keep only entries with more than one element (real ranges, not isolated
# wavelengths), plus a 'lo_hi|lo_hi' string key and their count.
df['missed_wavelength_range_len2'] = df['missed_wavelength_range'].map(
    lambda runs: [run for run in runs if len(run) > 1]
)
df['missed_wavelength_range_len2_str'] = df['missed_wavelength_range_len2'].map(
    lambda runs: '|'.join(f'{run[0]}_{run[1]}' for run in runs)
)
df['missed_wavelength_range_len2_cnt'] = df['missed_wavelength_range_len2'].map(len)
df.head()

In [None]:
# For every distinct set of missing ranges, count the samples per folder
# (samples without any missing range are excluded).
a = (
    df.loc[df['missed_wavelength_range_len2_str'] != '']
      .groupby('missed_wavelength_range_len2_str')['folder']
      .value_counts()
      .reset_index()
      .sort_values('missed_wavelength_range_len2_str')
)
px.bar(
    a,
    x='missed_wavelength_range_len2_str',
    y='count',
    color='folder',
    text_auto=True,
    title='Count of missing ranges of wavelength',
    width=1000,
)

In [None]:
# Number of samples for each total count of missing wavelengths.
df['missed_wavelength_len'].value_counts()

In [None]:
# Number of samples for each count of missing ranges (entries with more than one element).
missed_wavelength_range_len2_cnt = df['missed_wavelength_range_len2_cnt'].value_counts()
missed_wavelength_range_len2_cnt

In [None]:
# For each number of missing ranges, print how the total number of missing
# wavelengths is distributed among the matching samples.
for cnt in sorted(missed_wavelength_range_len2_cnt.index):
    matching = df.loc[df['missed_wavelength_range_len2_cnt'] == cnt, 'missed_wavelength_len']
    print(f'{cnt} range(s) in missing wavelengths')
    print(matching.value_counts())
    print('========================')

In [None]:
# Tally how often each missing range (keyed as 'lo_hi') occurs over all samples.
range_counts = {}
for ranges in df['missed_wavelength_range_len2']:
    for bounds in ranges:
        key = '_'.join(map(str, bounds))
        range_counts[key] = range_counts.get(key, 0) + 1

# One row per range, most frequent first.
range_list = pd.DataFrame.from_dict(range_counts, orient='index')
range_list.columns = ['count']
range_list = range_list.sort_values('count', ascending=False)
range_list = range_list.reset_index()

# Width of each range in wavelengths (both ends included), then order by width.
range_list['#missing'] = [
    int(hi) - int(lo) + 1 for lo, hi in range_list['index'].str.split('_')
]
range_list = range_list.sort_values('#missing', ascending=False).reset_index(drop=True)
range_list

In [None]:
# One figure per phase combination, drawing all of its spectra in a single colour.
labels = d['combined_name'].unique()
# NOTE(review): x_learning / x_slab are not used in this cell; they are kept
# in case other cells rely on them.
x_learning = range(280, 2701)
x_slab = range(350, 2501)

for label in labels:
    fig = go.Figure()
    df_label = d[d['combined_name'] == label]
    for _, row in df_label.iterrows():
        fig.add_trace(
            # go.Line is deprecated in plotly; a line-mode Scatter is the
            # supported equivalent.
            go.Scatter(
                x=row['wavelength'],
                y=row['reflectance'],
                mode='lines',
                line=dict(color='DarkSlateGrey'),
                showlegend=False,
            )
        )
    fig.update_layout(title=label, width=800)
    fig.update_xaxes(range=[280, 2750], title='wavelength')
    fig.update_yaxes(range=[0, 1], title='reflectance')
    fig.show()

# 2. Spectrum - Wavelength

In [None]:
# Number of filtered labelled samples per source folder.
d['folder'].value_counts()

In [None]:
# Column dtypes of the filtered labelled frame.
d.dtypes

In [None]:
# For every wavelength value, count in how many labelled spectra it appears.
# `itertools` comes from the notebook's import cell; the redundant in-cell
# import was removed so all dependencies are declared in one place.
e = pd.Series(itertools.chain.from_iterable(d['wavelength'].tolist())).value_counts()
e.index = e.index.astype(int)
px.bar(e,
       title='wavelengths counts',
       text_auto=True)

In [None]:
# Smallest and largest wavelength present in `e`.
e.index.min(), e.index.max()

In [None]:
# Scratch arithmetic; evaluates to 27.
# NOTE(review): 2700-279 matches the size of range(280, 2701); 2394 is presumably an observed count - confirm.
2700-279-2394

In [None]:
# Per-wavelength counts ordered by wavelength.
e.sort_index()

In [None]:
# How many wavelengths share each occurrence count, ordered by that count.
e.value_counts().sort_index()

In [None]:
# `f`: independent copy of the samples without abundances (label is False).
f = df[~df['label']].copy()
f.shape

In [None]:
# For every wavelength value, count in how many unlabelled spectra it appears.
# `itertools` comes from the notebook's import cell; the redundant in-cell
# import was removed so all dependencies are declared in one place.
e = pd.Series(itertools.chain.from_iterable(f['wavelength'].tolist())).value_counts()
e.index = e.index.astype(int)
px.bar(e,
       title='wavelengths counts in unlabeled sets',
       text_auto=True)

In [None]:
# Smallest and largest wavelength present in the unlabelled counts.
e.index.min(), e.index.max()

In [None]:
# Unlabelled per-wavelength counts ordered by wavelength.
e.sort_index()

In [None]:
# Number of unlabelled samples per source folder.
f['folder'].value_counts()

# 3. Reflectance

In [None]:
# Column names currently present in the filtered labelled frame.
d.columns

In [None]:
# Overall minimum and maximum reflectance across all labelled spectra.
labelled_reflectance = itertools.chain.from_iterable(d['reflectance'].tolist())
e = pd.Series(labelled_reflectance)
e.min(), e.max()

In [None]:
# Overall minimum and maximum reflectance across all unlabelled spectra.
unlabelled_reflectance = itertools.chain.from_iterable(f['reflectance'].tolist())
e = pd.Series(unlabelled_reflectance)
e.min(), e.max()

In [None]:
# Flag the samples whose wavelength and reflectance lists have the same
# length, then show the ones where they differ.
df['wave_refl_len'] = df['wavelength'].map(len).eq(df['reflectance'].map(len))
df.loc[~df['wave_refl_len']]

In [None]:
# length of wavelengths and reflectance?
df.wavelength.apply(len)

In [None]:
# NOTE(review): `len` is applied to each folder's group, so this returns the
# number of samples per folder, not the lengths of the wavelength lists.
df.groupby('folder')['wavelength'].apply(len)

# 4. Error

In [None]:
# Store the list length of each spectrum component in a '<name>_len' column.
for col in ('wavelength', 'reflectance', 'error'):
    df[f'{col}_len'] = df[col].apply(len)
df

In [None]:
# Rows where the three list lengths are not all equal.
lengths_agree = (
    (df['wavelength_len'] == df['reflectance_len'])
    & (df['wavelength_len'] == df['error_len'])
    & (df['reflectance_len'] == df['error_len'])
)
df[~lengths_agree]

In [None]:
# Number of samples for each (folder, wavelength_len) pair; the result keeps
# the name 'error_len' because that is the column being counted.
a = df.groupby(['folder', 'wavelength_len'])['error_len'].count()#.reset_index()
a

In [None]:
# Flatten the grouped counts into a table ordered by wavelength_len, then
# cast that column to str for use as a bar-chart axis.
b = (
    a.reset_index()
     .rename(columns={'error_len': 'count'})
     .sort_values('wavelength_len')
     .assign(wavelength_len=lambda frame: frame['wavelength_len'].astype(str))
)
b

In [None]:
# Bar chart of the number of samples per wavelength-list length, coloured by folder.
fig = px.bar(
    b,
    x='wavelength_len',
    y='count',
    color='folder',
    text_auto=True,
    title='count of wavelength in each dataset',
    height=900,
    width=800,
)
fig.show()

# 5. Wavelength vs. Reflectance

In [None]:
# Flatten all spectra into long form: one row per (wavelength, reflectance)
# point, tagged with its source folder.
# The per-folder pieces are collected in a list and concatenated once,
# instead of growing the frame by concat onto an empty seed inside the loop.
scatter_parts = []
for folder in ['learning', 'slab', 'test']:
    in_folder = df[df['folder'] == folder]
    wavelengths = pd.Series(itertools.chain.from_iterable(in_folder['wavelength']))
    reflectances = pd.Series(itertools.chain.from_iterable(in_folder['reflectance']))
    # The two flattened series are paired up by position.
    part = pd.concat([wavelengths, reflectances], axis=1)
    part.columns = ['wavelength', 'reflectance']
    part['folder'] = folder
    scatter_parts.append(part)

# Each piece keeps its own 0-based index, as before.
df_scatter = pd.concat(scatter_parts)

df_scatter

In [None]:
# Minimum and maximum of every remaining column, per folder.
df_scatter.groupby('folder').agg(['min', 'max'])

In [None]:
# Scatter of every (wavelength, reflectance) point, coloured by folder.
fig = px.scatter(df_scatter, x='wavelength', y='reflectance', color='folder',
                 opacity=0.5
                )
# Tiny markers keep the dense point cloud readable.
# The stray trailing comma after this call was removed: it wrapped the
# result in a throw-away tuple.
fig.update_traces(marker=dict(size=.8))
fig.show()

In [None]:
# One scatter plot per folder, all on the same axis ranges so they can be
# compared. This loop replaces three copy-pasted cells that differed only in
# the folder name; the order (test, learning, slab) and the final values of
# `folder`, `data_plot` and `fig` are the same as before.
for folder in ['test', 'learning', 'slab']:
    data_plot = df_scatter[df_scatter['folder'] == folder]
    fig = px.scatter(data_plot, x='wavelength', y='reflectance', color='folder', title=folder)
    fig.update_layout(yaxis_range=[0, 1], xaxis_range=[280, 2750])
    fig.update_traces(marker=dict(size=.8))
    fig.show()

# Data Transformation

In [None]:
# Drop the helper columns used during label validation.
# Rebinding with errors='ignore' (instead of an in-place drop) makes the cell
# safe to re-run: a second in-place drop raised KeyError because the columns
# were already gone.
d = d.drop(columns=['label', '100%', 'dup_label'], errors='ignore')
d

## Features

In [None]:
# Number of wavelength values in each training sample.
d['wavelength_len'] = d['wavelength'].apply(len)
d

In [None]:
# duplicate wavelength in each sample?
d['wavelength_len_2'] = d['wavelength'].apply(lambda x: len(set(x)))
d[d['wavelength_len_2']!=d['wavelength_len']]

In [None]:
# convert to int
d['wavelength'] = d['wavelength'].apply(lambda x: [int(i) for i in x])
d.head()

In [None]:
# Wavelengths absent from each training sample, relative to a reference
# integer range: 280..2700 for the 'learning' folder, 350..2500 otherwise.
# The reference sets are built once instead of once per row.
full_learning = set(range(280, 2701))
full_other = set(range(350, 2501))
d['missed_wavelength'] = [
    sorted((full_learning if j == 'learning' else full_other) - set(i))
    for i, j in zip(d['wavelength'], d['folder'])
]
d['missed_wavelength_len'] = d['missed_wavelength'].apply(len)

# Number of samples per (folder, number of missing wavelengths).
a = d.groupby('folder')['missed_wavelength_len'].value_counts().reset_index()
# Sort while the column is still numeric and only then cast to str for the
# categorical axis. Casting first sorted the labels lexicographically and
# disagreed with the equivalent cell that handles the full `df`.
a = a.sort_values('missed_wavelength_len')
a['missed_wavelength_len'] = a['missed_wavelength_len'].astype(str)
px.bar(a, x='missed_wavelength_len', y='count', color='folder', text_auto=True, width=650)

In [None]:
# Table behind the bar chart above: sample counts per folder and number of missing wavelengths.
a

In [None]:
# Missing-wavelength counts (as strings) shared by more than one row of `a`;
# the first such entry is skipped.
missing_cnt = a.loc[a['count'] > 1, 'missed_wavelength_len'].values[1:]

# For each of those counts, keep the wavelengths that are missing in the
# first matching training sample but in none of the other matching samples.
missing_list = {}
for missing in missing_cnt:
    same_len = d.loc[d['missed_wavelength_len'].astype(str) == missing, 'missed_wavelength']
    missing_list[missing] = set(same_len.iloc[0]).difference(*same_len.iloc[1:])
missing_list

In [None]:
# Same missing-wavelength analysis, now for every loaded sample.
# Reference integer ranges: 280..2700 for 'learning', 350..2500 otherwise.
range_learning = set(range(280, 2701))
range_other = set(range(350, 2501))

df['wavelength'] = df['wavelength'].map(lambda values: [int(v) for v in values])

missed_per_sample = []
for waves, folder_name in zip(df['wavelength'], df['folder']):
    reference = range_learning if folder_name == 'learning' else range_other
    missed_per_sample.append(sorted(reference - set(waves)))
df['missed_wavelength'] = missed_per_sample
df['missed_wavelength_len'] = df['missed_wavelength'].map(len)

# Samples per (folder, number of missing wavelengths), ordered numerically
# before the column is cast to str for the categorical axis.
a = df.groupby('folder')['missed_wavelength_len'].value_counts().reset_index()
a = a.sort_values('missed_wavelength_len')
a['missed_wavelength_len'] = a['missed_wavelength_len'].astype(str)

px.bar(
    a,
    x='missed_wavelength_len',
    y='count',
    color='folder',
    text_auto=True,
    width=650,
)

In [None]:
# Missing-wavelength counts (as strings) shared by more than one row of `a`;
# the first such entry is skipped.
missing_cnt = a.loc[a['count'] > 1, 'missed_wavelength_len'].values[1:]

# For each of those counts, keep the wavelengths that are missing in the
# first matching sample of `df` but in none of the other matching samples.
missing_list = {}
for missing in missing_cnt:
    same_len = df.loc[df['missed_wavelength_len'].astype(str) == missing, 'missed_wavelength']
    missing_list[missing] = set(same_len.iloc[0]).difference(*same_len.iloc[1:])
missing_list

In [None]:
# Train set: for every wavelength, the number of training samples in which it is missing.
a = pd.Series(itertools.chain.from_iterable(d['missed_wavelength'].values)).value_counts()
# Order by count, then by wavelength, and use the wavelength (as str) as the index.
b = (
    a.reset_index()
     .sort_values(['count', 'index'])
     .set_index('index')
)
b.index = b.index.astype(str).rename('missing_wavelength')
px.scatter(b)

In [None]:
# b.to_csv('output/missing_wavelengths_train.csv')

In [None]:
# Test set: the unlabelled samples of `df`.
test = df.loc[~df['label']]
# For every wavelength, the number of unlabelled samples in which it is missing.
a = pd.Series(itertools.chain.from_iterable(test['missed_wavelength'].values)).value_counts()
# Order by count, then by wavelength, and use the wavelength (as str) as the index.
b = (
    a.reset_index()
     .sort_values(['count', 'index'])
     .set_index('index')
)
b.index = b.index.astype(str).rename('missing_wavelength')
px.scatter(b)

In [None]:
# b.to_csv('output/missing_wavelengths_test.csv')

In [None]:
# All samples: for every wavelength, the number of samples in which it is missing.
a = pd.Series(itertools.chain.from_iterable(df['missed_wavelength'].values)).value_counts()
# Order by count, then by wavelength, and use the wavelength (as str) as the index.
b = (
    a.reset_index()
     .sort_values(['count', 'index'])
     .set_index('index')
)
b.index = b.index.astype(str).rename('missing_wavelength')
px.scatter(b)

In [None]:
# b.to_csv('output/missing_wavelengths.csv')

In [None]:
# Same counts ordered by wavelength, shown as bars.
b = a.sort_index().rename_axis('missing_wavelength')
px.bar(b)

In [None]:
# Wavelengths with the lowest and the highest missing count.
a.idxmin(), a.idxmax()

In [None]:
# Size of the set stored under key '660' in `missing_list`.
# NOTE(review): the key is hard-coded; this raises KeyError when `missing_list` has no '660' entry.
len(missing_list['660'])

In [None]:
# Number of training samples per count of missing wavelengths, ordered
# numerically and labelled with strings for the categorical axis.
a = d['missed_wavelength_len'].value_counts().sort_index()
a = a.rename(index=str)
px.bar(a, text_auto=True, width=650)