**Import Libraries**

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

In [2]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [3]:
!pip install xgboost==1.7.5

Collecting xgboost==1.7.5
  Downloading xgboost-1.7.5-py3-none-manylinux2014_x86_64.whl (200.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.3/200.3 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xgboost
  Attempting uninstall: xgboost
    Found existing installation: xgboost 2.0.2
    Uninstalling xgboost-2.0.2:
      Successfully uninstalled xgboost-2.0.2
Successfully installed xgboost-1.7.5


In [None]:
# Mount Google Drive at /content/drive (requires the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

**Load Data**

In [None]:
# Read the training set, the test set and the sample submission, in that order.
df, tdf, ss = (
    pd.read_csv(csv_path)
    for csv_path in (
        "drive/MyDrive/pi/Train.csv",
        "drive/MyDrive/pi/Test.csv",
        "drive/MyDrive/pi/SampleSubmission.csv",
    )
)

In [None]:
# Mode switch read by later cells: 1 holds out part of the training data and
# prints a ROC AUC; any other value writes predictions for the test set to CSV.
validation=0

In [None]:
# Number of rows in the test set.
len(tdf)

In [None]:
# Number of distinct survey dates in the training data.
df["Survey_date"].nunique()

In [None]:
# Validation mode: carve a stratified hold-out from the training data and use
# it in place of the real test set.
if validation == 1:
    df, tdf = train_test_split(df, test_size=0.33, stratify=df.Target, random_state=0)
    # The split returns slices of the original frame; copy them so the column
    # assignments below and in later cells do not raise SettingWithCopyWarning.
    df, tdf = df.copy(), tdf.copy()
    # Keep the true labels (in submission layout) for scoring later.
    ss = tdf[ss.columns].copy()
    # Hide the labels of the hold-out rows, mirroring the unlabeled test set.
    tdf["Target"] = np.nan

In [None]:
# Flag the origin of each row (0 = train, 1 = test) so the frames can be
# separated again after they are combined.
df = df.assign(test=0)
tdf = tdf.assign(test=1)

In [None]:
# Stack train and test rows so the features are computed over both.
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement and,
# like append, keeps the original index labels.
df = pd.concat([df, tdf])

In [None]:
# Display the combined frame (train rows followed by test rows).
df

**Feature Engineering**

In [None]:

# Per-category statistics: for every grouping column, attach the group-wise
# standard deviation and mean of every numeric column.
num_cols = [
    'Matric', 'Degree', 'Diploma', 'Schoolquintile',
    'Female', 'Sa_citizen', 'Birthyear', 'Birthmonth',
]
cat_cols = [
    'Status', 'Geography', 'Province', 'Math', 'Round', 'Tenure',
    'Mathlit', 'Additional_lang', 'Home_lang', 'Science',
]
for col in cat_cols:
    print(col, df[col].nunique())
    for feat in num_cols:
        by_col = df.groupby(col)[feat]
        df[f"{col}_{feat}_fstd"] = by_col.transform("std")
        df[f"{col}_{feat}_fmean"] = by_col.transform("mean")



In [None]:
# Pairwise statistics: for every unordered pair of grouping columns (a, b),
# attach the std and mean of every numeric column within each (a, b) group.
num_cols = [
    'Matric', 'Degree', 'Diploma', 'Schoolquintile',
    'Female', 'Sa_citizen', 'Birthyear', 'Birthmonth']
c_cols = [
    'Status', 'Geography', 'Province', 'Math', 'Round',
    'Mathlit', 'Additional_lang', 'Home_lang', 'Science']
for i, a in enumerate(c_cols):
    # Only columns after `a`, so each pair is visited exactly once.
    for b in c_cols[i + 1:]:
        for feat in num_cols:
            for stat in ["std", "mean"]:
                # Group by the pair itself. The previous version grouped by a
                # loop variable left over from an earlier cell, so every
                # "pair" column repeated one single-column statistic.
                df[f"{a}_{b}_{feat}_f{stat}"] = df.groupby([a, b])[feat].transform(stat)

In [None]:
# Categorical column names.
# NOTE(review): no later cell visible here reads this list — confirm it is still needed.
cat_cols=['Survey_date', 'Status', 'Geography', 'Province', 'Math',
       'Mathlit', 'Additional_lang', 'Home_lang', 'Science']

In [None]:
# Numeric column names.
# NOTE(review): no later cell visible here reads this list — confirm it is still needed.
num_cols = ['Round', 'Tenure', 'Matric', 'Degree', 'Diploma', 'Schoolquintile',
       'Female', 'Sa_citizen', 'Birthyear', 'Birthmonth']

In [None]:
# Columns excluded from the model's feature list: the row identifier, the
# label, and the train/test marker.
drop_cols=["Person_id","Target","test"]

In [None]:

from sklearn.preprocessing import OrdinalEncoder


def feature_engineering(train):
    """Add location encodings and per-Round deviation features.

    Steps, in order:
      1. Ordinal-encode 'Geography' and 'Province' (written back onto the
         frame that was passed in).
      2. For each of Tenure, Age_survey, Province, Matric, Degree and
         Diploma, add 'Round_<feature>_mean' = value minus the mean of that
         feature within the row's Round. For Tenure, also add
         'Round_Tenure_std_dif' = value minus the within-Round std.
      3. Replace remaining missing values with 0.
      4. One-hot encode 'Geography' and 'Province'.

    Returns the resulting frame.
    """
    encoder = OrdinalEncoder()
    for place_col in ("Geography", "Province"):
        # fit_transform refits the encoder for each column.
        train[place_col] = encoder.fit_transform(train[[place_col]])

    centred_features = ("Tenure", "Age_survey", "Province", "Matric", "Degree", "Diploma")
    for featureg in ("Round",):
        group_keys = train[featureg]
        for feature in centred_features:
            per_group = train.groupby(featureg)[feature]
            train[f'{featureg}_{feature}_mean'] = train[feature] - group_keys.map(per_group.mean())
            if feature == "Tenure":
                train[f'{featureg}_{feature}_std_dif'] = train[feature] - group_keys.map(per_group.std())

    return one_hot_encode(train.fillna(0), ["Geography", "Province"])

def preprocess_data(data):
    """Derive date, age, grade-band and education features from the survey frame.

    Works on a copy, so the frame passed in is left untouched.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain 'Survey_date', 'Birthyear', 'Sa_citizen', 'Tenure',
        'Round', 'Degree', 'Diploma' and 'Matric'.

    Returns
    -------
    pd.DataFrame
        Copy of ``data`` without 'Sa_citizen' and 'Survey_date', plus the
        columns 'year', 'Age_survey', 'Subjects_over_70', 'div',
        'Tenure_div', 'Higher_education' and 'Education_progression'.
    """
    # Copy first so the new columns are not written into the caller's frame.
    data = data.copy()

    data['Survey_date'] = pd.to_datetime(data['Survey_date'])
    data['year'] = data['Survey_date'].dt.year
    data['Age_survey'] = data['year'] - data['Birthyear']
    data = data.drop(['Sa_citizen', "Survey_date"], axis=1)

    # Per row, count the cells whose text contains one of the two top grade
    # bands. Only non-numeric columns can hold such text, so scan those
    # column-wise instead of applying a Python function to every row.
    top_bands = "80 - 100 %|70 - 79 %"
    text_cols = [c for c in data.columns if not pd.api.types.is_numeric_dtype(data[c])]
    data['Subjects_over_70'] = sum(
        (
            data[c].astype(str).str.contains(top_bands, na=False).to_numpy(dtype=int)
            for c in text_cols
        ),
        0,
    )
    data['div'] = data['Tenure'] / data['Age_survey']
    data['Tenure_div'] = data['Tenure'] / data['Round']

    # Flag rows where any of Degree, Diploma or Matric equals 1.
    data['Higher_education'] = ((data['Degree'] == 1) | (data['Diploma'] == 1) | (data['Matric'] == 1)).astype(int)
    # NOTE(review): diff() compares each row with the row above it, so this
    # value depends on row order — confirm that is intended.
    data['Education_progression'] = data['Higher_education'].diff().fillna(0)

    return data


def encode(data):
    """One-hot encode the grade, language and status columns.

    After encoding, every remaining missing value is replaced with 0 and the
    'Math_80_100' indicator column is dropped. Returns the resulting frame.
    """
    dummy_source_cols = [
        "Schoolquintile", "Math", "Mathlit", "Additional_lang",
        "Home_lang", "Science", "Status",
    ]

    encoded = one_hot_encode(data, dummy_source_cols)
    encoded = encoded.fillna(0)

    return encoded.drop(["Math_80_100"], axis=1)

def one_hot_encode(data, features):
    """One-hot encode ``features`` and normalise all column names.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing the columns listed in ``features``.
    features : list of str
        Columns to expand into indicator columns. The first level of each is
        dropped and a separate indicator is added for missing values.

    Returns
    -------
    pd.DataFrame
        Encoded frame whose column names contain only word characters:
        spaces become underscores, other special characters are removed,
        runs of underscores collapse to one, and trailing underscores are
        stripped.
    """
    data = pd.get_dummies(data, columns=features, drop_first=True, dummy_na=True)

    # Raw strings keep \w and \s from being read as (invalid) Python escape
    # sequences; regex= is spelled out because its default differs between
    # pandas versions.
    data.columns = (
        data.columns
        .str.replace(' ', '_', regex=False)        # spaces -> underscores
        .str.replace(r'[^\w\s]', '', regex=True)   # drop special characters
        .str.replace(r'_+', '_', regex=True)       # collapse repeated underscores
        .str.rstrip('_')                           # no trailing underscore
    )

    return data




# Run the three preparation stages in order on the combined frame, then show it.
df = df.pipe(preprocess_data).pipe(encode).pipe(feature_engineering)
df

In [None]:
# Separate the combined frame back into test and train rows. Explicit copies
# are taken because the prediction columns are later assigned onto `tdf`;
# assigning onto a filtered slice raises SettingWithCopyWarning.
tdf = df[df.test == 1].copy()
df = df[df.test == 0].copy()

**Modelling**

In [None]:
import lightgbm
from lightgbm import LGBMClassifier

In [None]:
# Show the installed LightGBM version.
lightgbm.__version__

In [None]:
# First model of the blend: LightGBM with 50 trees.
# NOTE(review): `metrics` is presumably forwarded to LightGBM as the evaluation metric — confirm.
model1 = LGBMClassifier(n_estimators=50,metrics='auc')


In [None]:
import xgboost
from xgboost import XGBClassifier
# Second model of the blend: XGBoost with 100 trees of depth 6, learning rate
# 0.1, and a subsample setting of 0.9.
model2 = XGBClassifier(
    learning_rate=0.1,
    max_depth=6,
    n_estimators=100,
    subsample=0.9
)

In [None]:
# Model features: every column except the identifier, label and marker.
# Built in the frame's own column order. Going through a set made the order
# depend on string hashing, which varies between kernel sessions, so the
# models saw their inputs in a different order on each run.
cols = [c for c in df.columns if c not in drop_cols]

In [None]:
# Value passed to fillna() on the feature matrix before fitting and predicting.
# NOTE(review): filling with NaN presumably leaves missing values unchanged — confirm.
na_val = np.nan

In [None]:
# Fit both models on the same feature matrix and label column.
train_features = df[cols].fillna(na_val)
train_target = df["Target"]
model1.fit(train_features, train_target)
model2.fit(train_features, train_target)

In [None]:
# Predict & Blend

In [None]:
# Positive-class probability from each model, then an equal-weight blend.
test_features = tdf[cols].fillna(na_val)
tdf["Target1"] = model1.predict_proba(test_features)[:, 1]
tdf["Target2"] = model2.predict_proba(test_features)[:, 1]
tdf["Target"] = (tdf["Target1"] * 0.5) + (tdf["Target2"] * 0.5)

In [None]:
if validation != 1:
    # Submission mode: write the predictions in the sample-submission layout.
    tdf[ss.columns].to_csv("insight.csv", index=None)
else:
    # Validation mode: `ss` holds the true labels of the hold-out rows; join
    # the predictions on Person_id and report ROC AUC (truth = Target_x,
    # prediction = Target_y).
    sub = ss.merge(tdf[ss.columns], on="Person_id", how="left")
    print(roc_auc_score(sub["Target_x"], sub["Target_y"]))

In [None]:
# Display the predictions restricted to the sample-submission columns.
tdf[ss.columns]