In [None]:
import os
import sys
import pandas as pd

# Walk up from the current working directory until we are inside the project
# root directory, which is identified by its name "group_8".
while os.path.basename(os.getcwd()) != "group_8":
    _cwd = os.getcwd()
    # At the filesystem root the parent of a directory is the directory itself;
    # chdir("..") would then be a no-op and the loop would never terminate.
    if os.path.dirname(_cwd) == _cwd:
        raise FileNotFoundError(
            "Could not find a parent directory named 'group_8'; "
            "start the notebook from inside the project tree."
        )
    os.chdir("..")
project_dir = os.getcwd()

# Make the project's `src` packages importable; skip the append when the cell
# is re-run so sys.path does not accumulate duplicate entries.
_src_dir = os.path.join(project_dir, "src")
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

from utils.data_preparation import (
    preprocess_data,
    data_connection,
    find_patho_genes_df,
    binarizee,
    selected_genes
)
from utils.statistical_tests import (
    matrix_fisher,
    matrix_chi2,
    create_model_matrix
)
from utils.visualisation import generate_heatmap
from models.model_teeth import (
    prepare_data_for_model_teeth,
    train_test_random_forest_with_undersampling
)
from models.model_cleft import XGBoost_model

# Data directory: a `local_files` folder located next to the project root.
input_path = os.path.abspath(os.path.join(project_dir, "..", "local_files"))
# The same directory is passed as both the input and the output location.
output_path = input_path
data_connection(input_path, output_path)

# Directory handed to the plotting and model helpers in later cells.
save_dir = os.path.join(project_dir, "src", "visualisation")

# Silence FutureWarning messages for the remainder of the notebook.
import warnings
warnings.simplefilter("ignore", category=FutureWarning)

# Load genetics.csv from the data directory.
genetics_csv_path = os.path.join(input_path, "genetics.csv")
df = pd.read_csv(genetics_csv_path)

In [None]:
# NOTE(review): `df` is rebound to the preprocessed frame, so re-running this
# cell feeds already-preprocessed data back into preprocess_data — confirm the
# helper tolerates that, or restart the kernel before re-running.
df = preprocess_data(df)

# The filter settings below are passed straight through to find_patho_genes_df.
all_filters_df = find_patho_genes_df(
    df,
    MHD_min3=True,
    freq_threshold=0.05,
    silent_mutation=False,
    malicious=True,
)

In [None]:
# Run the gene selection once per condition, "teeth" first and then "cleft".
selected_genes_teeth, selected_genes_cleft = (
    selected_genes(all_filters_df, df, illness=condition)
    for condition in ("teeth", "cleft")
)

In [None]:
# Both heatmap calls share the same positional inputs and differ only in `illness`.
heatmap_inputs = (all_filters_df, df, selected_genes_teeth, selected_genes_cleft)

generate_heatmap(*heatmap_inputs, save_dir=save_dir, illness="teeth")
generate_heatmap(*heatmap_inputs, save_dir=save_dir, illness="cleft")

In [None]:
# Build one model matrix per condition, "teeth" first and then "cleft".
model_matrix_teeth, model_matrix_cleft = (
    create_model_matrix(
        all_filters_df,
        df,
        selected_genes_teeth,
        selected_genes_cleft,
        illness=condition,
    )
    for condition in ("teeth", "cleft")
)

In [None]:
# Matrices produced by binarizee for each condition.
binmatrix_teeth = binarizee(all_filters_df, df, illness="teeth")
binmatrix_cleft = binarizee(all_filters_df, df, illness="cleft")

# Rows where both the control column and the condition column are above zero.
# Only the chi2 calls below use these subsets; the Fisher calls get the full matrices.
teeth_both_above_0 = (
    (binmatrix_teeth['control_with_pathogenic'] > 0)
    & (binmatrix_teeth['teeth_with_pathogenic'] > 0)
)
above_0_matrix_teeth = binmatrix_teeth[teeth_both_above_0]

cleft_both_above_0 = (
    (binmatrix_cleft['control_with_pathogenic'] > 0)
    & (binmatrix_cleft['cleft_with_pathogenic'] > 0)
)
above_0_matrix_cleft = binmatrix_cleft[cleft_both_above_0]

# target_name values are Polish labels: "braki zębowe" = missing teeth, "rozszczepy" = clefts.
matrix_chi2_teeth_df = matrix_chi2(above_0_matrix_teeth, target_name="braki zębowe")
matrix_chi2_cleft_df = matrix_chi2(above_0_matrix_cleft, target_name="rozszczepy")
# NOTE(review): method='hommel' is presumably the multiple-testing correction — confirm in utils.statistical_tests.
matrix_fisher_teeth_df = matrix_fisher(binmatrix_teeth, target_name="braki zębowe", method='hommel')
matrix_fisher_cleft_df = matrix_fisher(binmatrix_cleft, target_name="rozszczepy", method='hommel')

# Keep only the Gene_Name column for rows whose p-value column is below 0.05.
chi2_teeth_below_005 = matrix_chi2_teeth_df['teeth_chi2_p-value_0.05'] < 0.05
filtered_chi2_teeth = matrix_chi2_teeth_df.loc[chi2_teeth_below_005, ['Gene_Name']]

chi2_cleft_below_005 = matrix_chi2_cleft_df['cleft_chi2_p-value_0.05'] < 0.05
filtered_chi2_cleft = matrix_chi2_cleft_df.loc[chi2_cleft_below_005, ['Gene_Name']]

fisher_cleft_below_005 = matrix_fisher_cleft_df['cleft_fisher_p-value_0.05'] < 0.05
filtered_fisher_cleft = matrix_fisher_cleft_df.loc[fisher_cleft_below_005, ['Gene_Name']]

fisher_teeth_below_005 = matrix_fisher_teeth_df['teeth_fisher_p-value_0.05'] < 0.05
filtered_fisher_teeth = matrix_fisher_teeth_df.loc[fisher_teeth_below_005, ['Gene_Name']]

In [None]:
# Prepare the teeth-model input from the preprocessed frame, then run the
# train/test helper with a fixed seed; save_dir is passed through to it.
teeth_base = prepare_data_for_model_teeth(df)
train_test_random_forest_with_undersampling(
    teeth_base,
    save_dir,
    random_seed=42,
)

In [None]:
# Run the cleft model on the cleft model matrix built earlier; save_dir is
# passed through to the helper (presumably where its outputs are written — confirm in models.model_cleft).
XGBoost_model(model_matrix_cleft, save_dir)