In [1]:
# Import the required libraries
import json

import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats
import joblib

from functions import plot_text, normalize_target, check_chi2, barplot_group, fullfeat, check_overfitting, \
plot_confusion_matrix, duomatrix, get_metrics, replace_values
from get_metrics import get_metrics_classification
from sklearn.metrics import precision_score, recall_score, mean_absolute_error, accuracy_score, \
f1_score, log_loss, roc_curve, auc, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_predict

import yaml
from yaml import load
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.inspection import permutation_importance

import warnings
from warnings import simplefilter
warnings.filterwarnings("ignore")
simplefilter("ignore", category=RuntimeWarning)

import optuna
from optuna.integration import LightGBMPruningCallback
from optuna.visualization import plot_optimization_history

from tqdm import tqdm_notebook

In [2]:
def replace_values(data: pd.DataFrame, map_change_columns: dict) -> pd.DataFrame:
    """
    Rename the ``Female`` column to ``Gender`` and substitute values in the dataset.

    NOTE(review): this definition shadows the ``replace_values`` imported
    from ``functions`` at the top of the notebook.

    :param data: dataset
    :param map_change_columns: mapping passed to ``DataFrame.replace``
        (features and the values to substitute)
    :return: new dataset with the column renamed and the values replaced
    """
    # The rename happens first, so the mapping is applied to the 'Gender' name
    renamed = data.rename(columns={'Female': 'Gender'})
    replaced = renamed.replace(map_change_columns)
    return replaced

def binar(data: pd.DataFrame):
    """
    Binarization step, used when needed.

    Passes the whole dataset through ``pd.get_dummies`` and returns the result.

    :param data: dataset
    :return: dataset returned by ``pd.get_dummies``
    """
    return pd.get_dummies(data)

def check_columns_evaluate(data: pd.DataFrame, unique_values_path: str, **kwargs) -> pd.DataFrame:
    """
    Align a test dataset with the feature list saved from train.

    The JSON file at ``unique_values_path`` is read and its top-level keys are
    taken as the expected feature names, in file order. Features missing from
    ``data`` are added and filled with 0; columns that are not in the list are
    left out of the result. The frame passed in is not modified.

    :param data: test dataset
    :param unique_values_path: path to the JSON file with the train features
    :param kwargs: accepted for call compatibility; not used
    :return: test dataset containing exactly the train features, in train order
    """
    with open(unique_values_path) as json_file:
        unique_values = json.load(json_file)

    # A concrete list (rather than a dict view) keeps the selection ordered and explicit
    column_sequence = list(unique_values)

    missing_columns = [name for name in column_sequence if name not in data.columns]
    if missing_columns:
        # assign() returns a new frame, so the caller's dataset stays untouched
        data = data.assign(**dict.fromkeys(missing_columns, 0))

    return data[column_sequence]

# Import

In [3]:
# Location of the project parameters file.
# NOTE(review): this is an absolute path on one machine - consider making it
# relative or configurable so the notebook runs elsewhere.
config_path = '/home/basil/Downloads/mlops/config/params.yml'

# The context manager closes the file handle once the YAML has been parsed
with open(config_path) as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

# Sections of the config used further down in the notebook
preproc = config['preprocessing']
training = config['train']
evaluate = config['evaluate']


# check columns with train
# NOTE(review): the path comes from preproc['input_path'], while the column check
# inside pipeline_preprocess reads kwargs["unique_values_path"] - confirm which
# file is meant here.
column_sequence_path = preproc['input_path']
with open(column_sequence_path) as json_file:
    column_sequence = json.load(json_file)

In [4]:
# Read the dataset to be scored from the CSV file at evaluate['predict_path']
data = pd.read_csv(evaluate['predict_path'])

# Preprocessing 

In [5]:
def pipeline_preprocess(data: pd.DataFrame, flg_evaluate: bool = True, **kwargs):
    """
    Data preprocessing pipeline.

    Steps, in order:
      1. drop the columns listed in ``drop_columns`` (those that are present);
      2. train mode only: save the input data via ``save_input_data``;
      3. rename / replace values via ``replace_values``;
      4. set ``Tenure`` to 0 for the 'studying' and 'other' statuses, fill the
         remaining gaps with the mode, then log-transform ``Birthyear`` and ``Tenure``;
      5. fill every remaining gap with the string ``'None'``;
      6. binarize via ``binar``;
      7. evaluate mode: align the columns with the train features via
         ``check_columns_evaluate``; train mode: save them via
         ``save_unique_train_data``.

    :param data: dataset; must contain the ``Status``, ``Tenure`` and ``Birthyear`` columns
    :param flg_evaluate: True for evaluate mode, False for train mode
    :param kwargs: preprocessing settings. Keys read here: ``drop_columns``
        (optional), ``map_change_columns``, ``unique_values_path`` and, in train
        mode only, ``target_column`` and ``input_path``
    :return: preprocessed dataset
    """
    # Manual input contains only the required features, so the service columns
    # may be absent: drop whichever of them exist instead of swallowing every error.
    drop_columns = kwargs.get("drop_columns") or []
    data = data.drop(columns=drop_columns, errors="ignore")

    if not flg_evaluate:
        # NOTE(review): save_input_data is neither defined nor imported in the
        # visible part of this notebook - it must be in scope for train mode.
        save_input_data(
            data=data,
            drop_columns=kwargs["drop_columns"],
            target_column=kwargs["target_column"],
            input_path=kwargs["input_path"],
        )

    # transform values
    data = replace_values(data=data, map_change_columns=kwargs["map_change_columns"])

    # Tenure is set to zero for every 'studying' / 'other' row (this covers their gaps too)
    zero_tenure = data["Status"].isin(["studying", "other"])
    data["Tenure"] = np.where(zero_tenure, 0, data["Tenure"])

    # The remaining gaps in the feature are filled with the mode
    data["Tenure"] = _fill_with_mode(data["Tenure"])

    # Log-transform to bring the distributions closer to normal
    data["Birthyear"] = np.log(data["Birthyear"] + 1)
    data["Tenure"] = np.log(data["Tenure"])
    # log(0) is -inf and log of values below 1 is negative: clamp both to zero
    data.loc[data["Tenure"] < 0, "Tenure"] = 0.0
    # Anything still missing after the transform (e.g. log of a negative value)
    data["Tenure"] = _fill_with_mode(data["Tenure"])

    # Gaps in the other features are filled with 'None', since those features
    # are either categorical or binary.
    data = data.fillna('None')

    # Binarize
    data = binar(data)

    # Check the dataset against the train features,
    # or save the unique train data together with the features
    if flg_evaluate:
        data = check_columns_evaluate(
            data=data, unique_values_path=kwargs["unique_values_path"]
        )
    else:
        # NOTE(review): save_unique_train_data is neither defined nor imported in
        # the visible part of this notebook - it must be in scope for train mode.
        save_unique_train_data(
            data=data,
            drop_columns=kwargs["drop_columns"],
            target_column=kwargs["target_column"],
            unique_values_path=kwargs["unique_values_path"],
        )

    # Code for converting features to the category dtype, kept commented out:
    # change category types
    # dict_category = {key: "category" for key in data.select_dtypes(["object"]).columns}
    # data = transform_types(data=data, change_type_columns=dict_category)

    return data


def _fill_with_mode(column: pd.Series, default: float = 0.0) -> pd.Series:
    """Fill gaps in ``column`` with its first mode, or with ``default`` if it has no mode."""
    modes = column.mode()
    # Series.mode() returns a Series; passing it to fillna would align on the index
    # and leave most gaps untouched, so the scalar value is extracted first.
    # An all-missing column has no mode at all - fall back to ``default``.
    fill_value = modes.iloc[0] if not modes.empty else default
    return column.fillna(fill_value)

In [6]:
# Prepare the loaded dataset for prediction: evaluate mode, with the
# 'preprocessing' section of the config passed as keyword settings
eval_data = pipeline_preprocess(data=data, flg_evaluate=True, **preproc)

In [7]:
# For the demo project the baseline model is loaded (to save time)
# NOTE(review): joblib.load unpickles the file - load model files from trusted sources only
model = joblib.load(training['model_path'])

# Add the prediction for every row to the original dataset
data['predict'] = model.predict(eval_data)

In [8]:
# Display the original dataset together with the added 'predict' column
data

Unnamed: 0,Person_id,Survey_date,Round,Status,Tenure,Geography,Province,Matric,Degree,Diploma,...,Math,Mathlit,Additional_lang,Home_lang,Science,Female,Sa_citizen,Birthyear,Birthmonth,predict
0,Id_r90136smvl,2022-08-03,3,other,,Urban,KwaZulu-Natal,1.0,0.0,0.0,...,0 - 29 %,,50 - 59 %,,40 - 49 %,0,1,2002,12,1
1,Id_wawdqhmu6s,2023-03-16,4,unemployed,979.0,Urban,Western Cape,1.0,0.0,0.0,...,,,40 - 49 %,,,1,1,1989,12,0
2,Id_ap2czff2bu,2023-03-14,4,unemployed,339.0,Urban,KwaZulu-Natal,0.0,0.0,0.0,...,,,,,,1,1,1989,12,0
3,Id_uhgink7iha,2023-02-16,4,studying,,Urban,Gauteng,1.0,0.0,0.0,...,,80 - 100 %,60 - 69 %,,,0,1,2002,11,0
4,Id_5j6bzk3k81,2023-03-23,4,unemployed,613.0,Urban,Gauteng,0.0,0.0,0.0,...,,,,,,1,1,1993,10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1929,Id_iogk84m72d,2022-08-05,3,unemployed,948.0,Urban,Gauteng,1.0,0.0,0.0,...,0 - 29 %,,50 - 59 %,,30 - 39 %,1,1,2000,1,1
1930,Id_exoifxvj0s,2023-03-06,4,studying,,Urban,Gauteng,1.0,0.0,0.0,...,30 - 39 %,,50 - 59 %,,30 - 39 %,1,1,2000,1,0
1931,Id_ixio0xbvta,2023-02-02,4,unemployed,767.0,Rural,Mpumalanga,1.0,,,...,30 - 39 %,,50 - 59 %,,50 - 59 %,0,1,2000,1,0
1932,Id_3vjt2lnwp3,2023-03-02,4,studying,,Urban,North West,1.0,0.0,0.0,...,,40 - 49 %,,50 - 59 %,,1,1,2000,1,0
