In [7]:
## importing necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
## importing the necessary data
## (both paths are relative to the notebook's working directory; `train` and
## `test` are the raw frames that the processing cell later overwrites)
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [11]:
## Necessary functions

########## FUNCTIONS FOR INITIAL PROCESSING OF DATA ######################
## function to get the survey year
def get_survey_year(df):
    """Return the calendar year of every row's ``Survey_date``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a ``Survey_date`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.Series
        Year component of the parsed dates, sharing ``df``'s index.
    """
    survey_dates = pd.to_datetime(df["Survey_date"])
    return survey_dates.dt.year



## function to convert score ranges into equivalent grades
def scores_to_grades(df):
    """Swap percentage-band strings for letter grades, modifying ``df`` in place.

    The five subject columns (``Math``, ``Mathlit``, ``Additional_lang``,
    ``Home_lang``, ``Science``) are rewritten; cells that match none of the
    known bands (including missing values) are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all five subject columns.

    Returns
    -------
    None
    """
    band_to_grade = {
        "0 - 29 %": "G",
        "30 - 39 %": "F",
        "40 - 49 %": "E",
        "50 - 59 %": "D",
        "60 - 69 %": "C",
        "70 - 79 %": "B",
        "80 - 100 %": "A",
    }
    subject_cols = ("Math", "Mathlit", "Additional_lang", "Home_lang", "Science")

    for subject in subject_cols:
        for band, grade in band_to_grade.items():
            in_band = df[subject] == band
            df.loc[in_band, subject] = grade
            
            
            
## function to fill missing values with median
def fill_non_nan_median(df, col, filt_data, ref_df=None):
    """Impute missing ``col`` values of one group with that group's median.

    Only rows where ``df[filt_data[0]] == filt_data[1]`` and ``df[col]`` is
    missing are written; ``df`` is modified in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute.
    col : str
        Numeric column whose missing values are filled.
    filt_data : tuple
        ``(group_column, group_value)`` pair selecting the group.
    ref_df : pandas.DataFrame, optional
        Frame the median is computed from. Defaults to ``df`` itself, so the
        function no longer reads a notebook-level global whose contents depend
        on which cells have already run. Pass the (untransformed) training
        frame here to impute a hold-out set from training statistics.

    Returns
    -------
    None
    """
    group_col, group_val = filt_data[0], filt_data[1]
    source = df if ref_df is None else ref_df

    # Series.median skips missing values by default.
    group_median = source.loc[source[group_col] == group_val, col].median()
    if pd.isna(group_median):
        # Group absent from the reference frame, or it has no observed values:
        # nothing to impute with, so leave the gaps as they are.
        return

    needs_fill = (df[group_col] == group_val) & df[col].isna()
    df.loc[needs_fill, col] = group_median
    
    
            
## function to fill missing tenure values
def fill_tenure(df):
    """Flag and impute missing ``Tenure`` values, modifying ``df`` in place.

    Steps, in order:

    1. ``Tenure_missing`` is set to 1 where ``Tenure`` is missing *before* any
       imputation, else 0.
    2. Every row whose ``Status`` is ``"studying"`` or ``"other"`` gets
       ``Tenure = 0`` (observed values in those rows are overwritten too).
    3. Missing values for ``"wage employed"``, ``"unemployed"``,
       ``"self employed"`` and ``"employment programme"`` are filled through
       ``fill_non_nan_median``.
    4. Missing values for ``"wage and self employed"`` are filled with the
       mean of that group's observed ``Tenure``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Status`` and ``Tenure`` columns.

    Returns
    -------
    None
    """
    df["Tenure_missing"] = 0
    df.loc[df["Tenure"].isna(), "Tenure_missing"] = 1

    status = df["Status"]
    df.loc[(status == "studying") | (status == "other"), "Tenure"] = 0

    median_filled_statuses = (
        "wage employed",
        "unemployed",
        "self employed",
        "employment programme",
    )
    for status_value in median_filled_statuses:
        fill_non_nan_median(df, "Tenure", ("Status", status_value))

    # The previous code called .fillna(..., inplace=True) on the temporary
    # returned by df.loc[...], so nothing was ever written back to df.
    # Assign through .loc on the frame itself instead.
    is_wage_and_self = status == "wage and self employed"
    group_mean = df.loc[is_wage_and_self, "Tenure"].mean()
    df.loc[is_wage_and_self & df["Tenure"].isna(), "Tenure"] = group_mean
    
    
    
## function to drop columns
def drop_cols(df, cols):
    """Remove the given column label(s) from ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.
    cols : label or list of labels
        Column(s) to remove; a label that is not present raises ``KeyError``.

    Returns
    -------
    None
    """
    df.drop(columns=cols, inplace=True)
    
    
    
## function to engineer columns with missing values
def create_missing_feats(df, cols):
    """Add a 0/1 ``<col>_missing`` indicator for each listed column, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend.
    cols : iterable of str
        Columns to inspect; each gets an integer companion column that is 1
        where the source value is missing and 0 elsewhere.

    Returns
    -------
    None
    """
    for source_col in cols:
        is_missing = df[source_col].isna()
        df[f"{source_col}_missing"] = is_missing.astype("int64")

        
## function to fill missing feature values with mode
def fill_na_with_mode(df, cols):
    """Fill missing values in each listed column with that column's mode.

    ``df`` is modified in place. When a column has several modes, the first
    one returned by ``Series.mode`` is used. A column with no observed values
    has no mode and is left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute.
    cols : iterable of str
        Columns to fill.

    Returns
    -------
    None
    """
    for col in cols:
        modes = df[col].mode()
        if modes.empty:
            # Entirely-missing column: mode()[0] would raise IndexError.
            continue
        # Assign the result back instead of calling fillna(inplace=True) on
        # df[col]: the in-place call acts on an intermediate object and does
        # not update df when pandas Copy-on-Write is active.
        df[col] = df[col].fillna(modes.iloc[0])
        

        
## function to process data for training
def process_df(df):
    """Build the modelling frame from a raw survey frame.

    Works on a copy, so the caller's ``df`` is not modified. The steps run in
    this order: add ``Survey_year``; map score bands to grades; flag and
    impute ``Tenure`` and apply ``log1p`` to it; add ``<col>_missing``
    indicators; mode-fill ``Matric``, ``Degree`` and ``Diploma``; drop the
    identifier, date, subject-score and ``Schoolquintile`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame as loaded from the CSV files.

    Returns
    -------
    pandas.DataFrame
        The processed copy.
    """
    flagged_cols = [
        "Science", "Home_lang", "Mathlit",
        "Math", "Degree", "Diploma",
        "Schoolquintile", "Additional_lang", "Matric",
    ]
    mode_filled_cols = ["Matric", "Degree", "Diploma"]
    discarded_cols = [
        "Person_id", "Survey_date", "Science", "Home_lang", "Mathlit", "Math",
        "Schoolquintile", "Additional_lang",
    ]

    processed = df.copy()
    processed["Survey_year"] = get_survey_year(processed)
    scores_to_grades(processed)

    fill_tenure(processed)
    # log1p tames the right skew of Tenure
    processed["Tenure"] = np.log1p(processed["Tenure"])

    # indicators must be created before the mode fill erases the gaps
    create_missing_feats(processed, flagged_cols)
    fill_na_with_mode(processed, mode_filled_cols)

    drop_cols(processed, discarded_cols)
    return processed



########## FUNCTIONS FOR MAKING PREDICTIONS ######################
## function for prediction on test data
def make_preds(data, model, savename):
    """Score ``data`` with ``model`` and write a submission CSV.

    The file ``submissions/<savename>.csv`` receives two columns:
    ``Person_id`` and ``Target`` (the second column of ``predict_proba``).

    Parameters
    ----------
    data : pandas.DataFrame
        Processed frame that still carries ``Person_id``. The caller's frame
        is not modified.
    model : object
        Fitted estimator exposing ``predict_proba``.
    savename : str
        File name (without extension) inside ``submissions/``.

    Returns
    -------
    None

    Notes
    -----
    Relies on a notebook-level ``scaler`` that is not defined in this section;
    presumably it was fitted on the same five columns of the training frame
    (TODO confirm).
    NOTE(review): ``get_dummies`` is applied to ``data`` alone, so the dummy
    columns are only guaranteed to match what ``model`` was trained on if
    ``data`` contains the same category levels -- verify.
    """
    scaled_cols = ["Tenure", "Round", "Survey_year", "Birthyear", "Birthmonth"]

    ids = data["Person_id"]
    features = data.drop("Person_id", axis=1)
    # Scale the rows that were passed in. The previous code read these columns
    # from the global `test`, which silently produced wrong features whenever
    # `data` was any other frame.
    features[scaled_cols] = scaler.transform(features[scaled_cols])
    features = pd.get_dummies(features, drop_first=True)

    preds = model.predict_proba(features)
    submission = pd.DataFrame({"Person_id": ids.values, "Target": preds[:, 1]})
    submission.to_csv(f"submissions/{savename}.csv", index=False)

def ret_preds(data, model):
    """Return ``model.predict_proba`` output for ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Processed frame that still carries ``Person_id``. The caller's frame
        is not modified.
    model : object
        Fitted estimator exposing ``predict_proba``.

    Returns
    -------
    The value returned by ``model.predict_proba`` on the prepared features.

    Notes
    -----
    Relies on a notebook-level ``scaler`` that is not defined in this section.
    NOTE(review): only ``Tenure`` is scaled here, whereas ``make_preds``
    scales five columns with the same ``scaler``; confirm which of the two
    matches how ``scaler`` was fitted.
    """
    features = data.drop("Person_id", axis=1)
    # Scale the rows that were passed in, not the global `test` frame.
    features["Tenure"] = scaler.transform(features[["Tenure"]])
    features = pd.get_dummies(features, drop_first=True)
    return model.predict_proba(features)

def disp_confmat(model, y_true, X):
    """Plot the confusion matrix of ``model``'s predictions on ``X``.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    y_true : array-like
        Ground-truth labels for the rows of ``X``.
    X : array-like or pandas.DataFrame
        Features passed to ``model.predict``.

    Returns
    -------
    None
    """
    matrix = confusion_matrix(y_true, model.predict(X))
    ConfusionMatrixDisplay(matrix).plot()

In [10]:
## processing the training dataset
## NOTE: `train` is rebound to the processed frame, so this cell cannot be
## re-run without reloading the raw CSVs (process_df drops "Survey_date",
## which get_survey_year needs).
train = process_df(train)

## processing the test dataset
## keep the ids aside first: process_df drops "Person_id"
ids = test["Person_id"]
test = process_df(test)
## re-attaching Person_id as the first column of the processed test data
test = pd.concat([ids,test], axis=1)