In [1]:
import pandas as pd
import numpy as np
import modules
import random

from scipy.stats import beta
from modules import ml_functions
import matplotlib.pyplot as plt

# Notebook display configuration: passing None removes pandas' limit on how many
# rows / columns are rendered when a frame is displayed.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

# 1.0 Data retrieval

In [2]:
# Load the 'Raw Data' sheet of the CTG workbook into `df`.
# skiprows=[1] skips the sheet row with 0-based position 1 (the row right after
# the first one) while reading.
df = pd.read_excel(
    '../../data_lake/input/CTG.xls',
    sheet_name='Raw Data',
    skiprows=[1],
)

In [3]:
# Keep every row except the last three (positional slice).
# NOTE(review): presumably the sheet ends with three non-data rows — confirm against the workbook.
# This cell overwrites `df` from itself, so running it twice removes six rows.
df = df.iloc[:-3]

# 2.0 Data preparation

In [4]:
# Columns removed before modeling. The list is kept in one place so it is easy to audit.
columns_to_drop = [
    'FileName', 'Date',
    'A', 'B', 'C', 'D', 'E', 'AD', 'DE', 'LD', 'FS', 'SUSP',
    'b', 'e', 'CLASS',
]
df = df.drop(columns_to_drop, axis=1)

In [5]:
# Use the 'SegFile' column as the row index of the frame.
df = df.set_index(keys='SegFile')

In [6]:
# Remove constant columns, i.e. columns with exactly one distinct non-null value.
# A single vectorised mask replaces the loop that rebound `df` while iterating
# over it and copied the whole frame once per dropped column.
# Columns whose count is not exactly 1 (including all-null columns, count 0) are kept.
df = df.loc[:, df.nunique() != 1]

In [7]:
# Store 'Tendency' and 'NSP' as strings instead of their numeric dtype.
# Later cells look classes up by string keys such as '1.0'.
df = df.astype({'Tendency': str, 'NSP': str})

# 3.0 Modeling

In [8]:
# Independent (deep) copy of the prepared frame; the modeling cells pass it to
# ml_functions.model_preparation and rebind it to the value that call returns.
df_ml = df.copy(deep=True)

## 3.1 Class 1 - Normal

In [9]:
# Class label handled in this section; it is passed to the ml_functions helpers in the next cell.
class_nsp = 1.0

In [None]:
# Run the ml_functions pipeline for the class currently held in `class_nsp`.
# NOTE(review): what each helper does is inferred from its name only — confirm in modules/ml_functions.
# model_preparation returns a modeling frame plus a replacement for `df_ml`, so this
# cell consumes whatever `df_ml` the previous run left behind (re-running may not be idempotent).
df_model, df_ml = ml_functions.model_preparation(df_ml,class_nsp)
# data_splitting returns the train/test parts together with the column lists and parameters used for training.
X_train, X_test, y_train, y_test, numerical_cols, categorical_cols, parameters = ml_functions.data_splitting(df_model)
model = ml_functions.model_training(X_train, y_train, numerical_cols, categorical_cols, parameters)
# model_evaluation's return value replaces `model`.
model = ml_functions.model_evaluation(model,X_test,y_test)
# model_serving's return value replaces `df`; the next line expects it to contain 'proba_classe_1.0'.
df = ml_functions.model_serving(df,model,class_nsp,X_test)
# Histogram of the class-1.0 column as a quick sanity check.
df['proba_classe_1.0'].hist()

## 3.2 Class 2 - Suspect

In [11]:
# Class label handled in this section; it is passed to the ml_functions helpers in the next cell.
class_nsp = 2.0

In [None]:
# Run the ml_functions pipeline for the class currently held in `class_nsp`.
# NOTE(review): what each helper does is inferred from its name only — confirm in modules/ml_functions.
# model_preparation returns a modeling frame plus a replacement for `df_ml`, so this
# cell consumes whatever `df_ml` the previous run left behind (re-running may not be idempotent).
df_model, df_ml = ml_functions.model_preparation(df_ml,class_nsp)
# data_splitting returns the train/test parts together with the column lists and parameters used for training.
X_train, X_test, y_train, y_test, numerical_cols, categorical_cols, parameters = ml_functions.data_splitting(df_model)
model = ml_functions.model_training(X_train, y_train, numerical_cols, categorical_cols, parameters)
# model_evaluation's return value replaces `model`.
model = ml_functions.model_evaluation(model,X_test,y_test)
# model_serving's return value replaces `df`; the next line expects it to contain 'proba_classe_2.0'.
df = ml_functions.model_serving(df,model,class_nsp,X_test)
# Histogram of the class-2.0 column as a quick sanity check.
df['proba_classe_2.0'].hist()

## 3.3 Class 3 - Abnormal

In [None]:
# Class label handled in this section; it is passed to the ml_functions helpers in the next cell.
class_nsp = 3.0

In [None]:
# Run the ml_functions pipeline for the class currently held in `class_nsp`.
# NOTE(review): what each helper does is inferred from its name only — confirm in modules/ml_functions.
# model_preparation returns a modeling frame plus a replacement for `df_ml`, so this
# cell consumes whatever `df_ml` the previous run left behind (re-running may not be idempotent).
df_model, df_ml = ml_functions.model_preparation(df_ml,class_nsp)
# data_splitting returns the train/test parts together with the column lists and parameters used for training.
X_train, X_test, y_train, y_test, numerical_cols, categorical_cols, parameters = ml_functions.data_splitting(df_model)
model = ml_functions.model_training(X_train, y_train, numerical_cols, categorical_cols, parameters)
# model_evaluation's return value replaces `model`.
model = ml_functions.model_evaluation(model,X_test,y_test)
# model_serving's return value replaces `df`; the next line expects it to contain 'proba_classe_3.0'.
df = ml_functions.model_serving(df,model,class_nsp,X_test)
# Histogram of the class-3.0 column as a quick sanity check.
df['proba_classe_3.0'].hist()

# 4.0 Normalizing results

In [None]:
# Names of the three per-class probability columns, in class order 1.0, 2.0, 3.0.
cols = [f'proba_classe_{label}' for label in ('1.0', '2.0', '3.0')]

In [None]:
# Rescale the per-class columns so that they sum to 1 on every row.
# The row totals are computed once and divided out with a vectorised, index-aligned
# operation instead of calling a Python lambda for every row.
# A row whose total is 0 still yields NaN/inf, exactly as with plain division.
row_totals = df[cols].sum(axis=1)
df[cols] = df[cols].div(row_totals, axis=0)

# 5.0 Beta Inference

In [None]:
# Evaluation grid for the Beta densities: 100 evenly spaced points covering [0, 1]
# with both endpoints included.
xax = np.linspace(start=0, stop=1.0, num=100)

In [None]:
# Per-class bookkeeping: num_classe[label]['num'] is the number of rows whose 'NSP'
# equals that label, for every distinct label in first-appearance order.
num_classe = {
    label: {'num': int((df['NSP'] == label).sum())}
    for label in df['NSP'].unique()
}
# Total number of rows counted across all classes.
tot = sum(entry['num'] for entry in num_classe.values())

In [None]:
# Extend each class entry with:
#   'wei' - the class share of all counted rows (num / tot)
#   'k'   - num * wei, used later as the concentration of the Beta densities
for stats in num_classe.values():
    share = stats['num'] / tot
    stats['wei'] = share
    stats['k'] = stats['num'] * share

In [None]:
# For each class, turn every normalised probability p into a Beta density sampled on `xax`.
# Shape parameters: a = p*(k-2)+1 and b = (1-p)*(k-2)+1 with k = num_classe[label]['k'];
# for k > 2 this Beta has its mode at p and a + b = k.
# Each cell of the new '<column>_beta' column holds the array of density values.
for label in ('1.0', '2.0', '3.0'):
    spread = num_classe[label]['k'] - 2
    df[f'proba_classe_{label}_beta'] = df[f'proba_classe_{label}'].apply(
        lambda p: beta.pdf(xax, (p * spread + 1), ((1 - p) * spread + 1))
    )

Visual check: plot the three per-class Beta densities for one randomly selected record.

In [None]:
# Random row position used by the visual check.
# random.randrange(n) draws from 0..n-1, so the position is always valid for the
# current frame. random.randint(0, 2126) includes its upper bound and hard-codes
# the row count, so it can return a position one past the last row.
index = random.randrange(len(df))

In [None]:
# Plot the three per-class Beta densities of the record at row position `index`.
# `index` is a position, not a label, so it is resolved with .iloc; integer lookup via
# Series[int] on a non-integer index relies on a deprecated positional fallback in pandas.
# The densities were evaluated on `xax`, so that grid is used as the x-axis.
fig, ax = plt.subplots()
for label in ('1.0', '2.0', '3.0'):
    ax.plot(xax, df[f'proba_classe_{label}_beta'].iloc[index], label=label)
ax.set_xlabel('probability')
ax.set_ylabel('Beta density')
ax.legend();

# 6.0 Output

In [None]:
# Display two randomly chosen rows of the final frame as a spot check.
df.sample(n=2)