In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# ml
import catboost 
from catboost import Pool, CatBoostClassifier

from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score

/kaggle/input/playground-series-s4e6/sample_submission.csv
/kaggle/input/playground-series-s4e6/train.csv
/kaggle/input/playground-series-s4e6/test.csv


In [2]:
class Config: 
    train_path=  '/kaggle/input/playground-series-s4e6/train.csv'
    test_path = '/kaggle/input/playground-series-s4e6/test.csv'

    # cat - categorical
    # nocat - not categorical
    input_columns_params = {
        "Marital status": 'cat', 
        'Application mode': 'nocat', 
        'Application order': 'nocat', 
        'Course': 'nocat', 
        'Daytime/evening attendance': 'cat', 
        'Previous qualification': 'nocat', 
        'Previous qualification (grade)': 'nocat', 
        'Nacionality': 'cat', 
        "Mother's qualification": 'nocat', 
        "Father's qualification": 'nocat', 
        "Mother's occupation": "cat", 
        "Father's occupation": 'cat', 
        'Admission grade': 'nocat',  
        'Educational special needs': 'cat', 
        'Debtor': 'cat', 
        "Tuition fees up to date": 'cat', 
        'Gender': 'cat', 
        'Scholarship holder': 'cat', 
        'Age at enrollment': 'nocat', 
        'International': 'cat', 
        'Curricular units 1st sem (credited)': 'cat',
        'Curricular units 1st sem (enrolled)': 'cat',
        'Curricular units 1st sem (evaluations)': 'nocat',
        'Curricular units 1st sem (approved)': 'cat',
        'Curricular units 1st sem (grade)': 'nocat',
        'Curricular units 1st sem (without evaluations)': 'nocat',

        'Curricular units 2nd sem (credited)': 'cat',
        'Curricular units 2nd sem (enrolled)': 'cat',
        'Curricular units 2nd sem (evaluations)': 'nocat',
        'Curricular units 2nd sem (approved)': 'cat',
        'Curricular units 2nd sem (grade)': 'nocat',
        'Curricular units 2nd sem (without evaluations)': 'nocat',
        'Unemployment rate': 'nocat', 
        'Inflation rate': 'nocat', 
        'GDP': 'nocat', 
    }
    
    output_columns_params = {"Target": 'cat'}
    
    lightgbm_model_params = {
                 'objective': 'multiclass', # multiclass target: 'Graduated', 'Dropout', or 'Enrolled'
                 'data_sample_strategy': 'goss', # Gradient-based One-Sided Sampling
                 'tree_learner': 'feature', # split nodes based on the best feature
                 'n_estimators': 1743, # number of boosting iterations
                 'learning_rate': 0.02636616162598401, # step size for updatig model weights
                 'feature_fraction': 0.298183729482288, # about 30% of features considered at each split
                 'lambda_l1': 8.242410039948067e-07, # L1 regulation penalization - adding magnitude of weights to the loss
                 'lambda_l2': 0.4063299210212167, # L2 regulation penalization = adding the square of weights to the loss
                 'num_leaves': 699, # Maximum number of leaves (terminal nodes) to use
                 'max_depth': 8, # Maximum tree depth (levels) allowed
                 'colsample_bytree': 0.7975468653525116, # proportion of samples to randomly choose at each iteration
                 'min_child_samples': 102, # Minimum number of samples needed per leaf
                 'min_sum_hessian_in_leaf': 5.440582524630883, # Minimum sum of squared gradients allowed in a leaf node
                 'min_gain_to_split': 0.7247318987185962, # Minumum gain (model score improvement) to make further leaf partitions
                 'max_bin': 156, # Maximum numer of bins used for discretitizing features before tree splits
                 'top_rate': 0.6132659772851583, # Top proportion of features to choose (~61%)
                 'verbose': -1, # Turn off warnings and model logs for a cleaner look
                 'random_state': 1,  # Random state value for repeatablity,
                'early_stopping_rounds': 300, 
}
    
    n_splits = 12
    n_repeats = 1
    seed = 42
    
cfg= Config() 

In [3]:
df = pd.read_csv(cfg.train_path)
test_df = pd.read_csv(cfg.test_path)
df.head()

Unnamed: 0,id,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,0,1,1,1,9238,1,1,126.0,1,1,...,0,6,7,6,12.428571,0,11.1,0.6,2.02,Graduate
1,1,1,17,1,9238,1,1,125.0,1,19,...,0,6,9,0,0.0,0,11.1,0.6,2.02,Dropout
2,2,1,17,2,9254,1,1,137.0,1,3,...,0,6,0,0,0.0,0,16.2,0.3,-0.92,Dropout
3,3,1,1,3,9500,1,1,131.0,1,19,...,0,8,11,7,12.82,0,11.1,0.6,2.02,Enrolled
4,4,1,1,2,9500,1,1,132.0,1,19,...,0,7,12,6,12.933333,0,7.6,2.6,0.32,Graduate


In [4]:
def add_economic_interaction_features(df):    
    df['Economic_interaction_add'] = df['Unemployment rate'] * df['Inflation rate'] * df['GDP']
    df['Economic_interaction_add'] = df['Unemployment rate'] + df['Inflation rate'] + df['GDP']
    df['Economic_interaction_sub'] = df['Unemployment rate'] - df['Inflation rate'] - df['GDP']  
    df['Economic_interaction_div1'] = df['Unemployment rate'] / (df['Inflation rate'] + 1e-6)
    #df['Economic_interaction_div2'] = df['Inflation rate'] / (df['GDP'] + 1e-6)
    #df['Economic_interaction_div3'] = df['GDP'] / (df['Unemployment rate'] + 1e-6)   
    df['Economic_interaction_comb1'] = (df['Unemployment rate'] + df['Inflation rate']) * df['GDP']
    df['Economic_interaction_comb2'] = (df['Unemployment rate'] * df['Inflation rate']) + df['GDP']
    df['Economic_interaction_exp1'] = df['Unemployment rate'] * df['Inflation rate']
    df['Economic_interaction_exp2'] = df['GDP'] * df['Unemployment rate']
    df['Economic_interaction_exp3'] = df['GDP'] * df['Inflation rate']
    #df['Economic_interaction_log1'] = np.log1p(df['Unemployment rate']) + np.log1p(df['Inflation rate']) + np.log1p(df['GDP'])
    #df['Economic_interaction'] = (df['Unemployment rate'] * df['Inflation rate'] * df['GDP'])
    #df['Economic_interaction_sqrt1'] = np.sqrt(df['Unemployment rate']) + np.sqrt(df['Inflation rate']) + np.sqrt(df['GDP'])
   # df['Economic_interaction_sqrt2'] = np.sqrt(df['Unemployment rate'] * df['Inflation rate'] * df['GDP'])
    #df['Economic_interaction_poly1'] = df['Unemployment rate']**2 + df['Inflation rate']**2 + df['GDP']**2
    #df['Economic_interaction_poly2'] = (df['Unemployment rate'] * df['Inflation rate']) + (df['Inflation rate'] * df['GDP']) + (df['GDP'] * df['Unemployment rate'])
    return df     


def add_curriculum_interaction_features(df):
    df['Curriculum_interaction_1st_sem'] = df['Curricular units 1st sem (credited)'] * df['Curricular units 1st sem (enrolled)'] * df['Curricular units 1st sem (evaluations)'] * df['Curricular units 1st sem (approved)']
    df['Curriculum_interaction_2nd_sem'] = df['Curricular units 2nd sem (credited)'] * df['Curricular units 2nd sem (enrolled)'] * df['Curricular units 2nd sem (evaluations)'] * df['Curricular units 2nd sem (approved)']

def add_grade_interaction_features(df):
    df['Grade_interaction_1st_sem'] = df['Admission grade'] * df['Curricular units 1st sem (grade)']
    df['Grade_interaction_2nd_sem'] = df['Admission grade'] * df['Curricular units 2nd sem (grade)']

def add_qualification_interaction_features(df):
    df['Qualification_interaction'] = df['Previous qualification'] * df['Admission grade']
    df['Course_interaction'] = df['Previous qualification'] * df['Course']
    
def add_occupation_interaction_features(df):
    df["Occupation_interaction"] = df["Mother's occupation"] * df["Father's occupation"]

def add_enrollment_evaluation_interaction_features(df):
    df['Enrollment_evaluation_interaction_1st_sem'] = df['Curricular units 1st sem (enrolled)'] * df['Curricular units 1st sem (evaluations)']
    df['Enrollment_evaluation_interaction_2nd_sem'] = df['Curricular units 2nd sem (enrolled)'] * df['Curricular units 2nd sem (evaluations)']

def add_gender_marital_interaction_features(df):
    df['Gender_marital_interaction'] = df['Gender'] * df['Marital status']

def add_tuition_scholarship_interaction_features(df):
    df['Tuition_scholarship_interaction'] = df['Tuition fees up to date'] * df['Scholarship holder']  

In [5]:
def feature_engineer_train(df_train):    
    #df_train = handle_skewed_columns(df_train)   
    
    add_curriculum_interaction_features(df_train)
    add_grade_interaction_features(df_train)
#     add_age_interaction_features(df_train)
    add_economic_interaction_features(df_train)
    add_qualification_interaction_features(df_train)
    add_occupation_interaction_features(df_train)
    add_enrollment_evaluation_interaction_features(df_train)
    add_gender_marital_interaction_features(df_train)
    add_tuition_scholarship_interaction_features(df_train)
    return df_train

def prepare_X_1(df): 
    X = df[[k for k, v in cfg.input_columns_params.items()]]
    X = feature_engineer_train(X)
    # Convert real numbers for catboost 
    for i in [k for k, v in cfg.input_columns_params.items() if v == 'cat']: 
        X[i] = X[i].apply(lambda i: str(i))
        
    return X 

In [6]:
X = prepare_X_1(df)
y = df[[k for k, v in cfg.output_columns_params.items()]]

test_X = prepare_X_1(test_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Curriculum_interaction_1st_sem'] = df['Curricular units 1st sem (credited)'] * df['Curricular units 1st sem (enrolled)'] * df['Curricular units 1st sem (evaluations)'] * df['Curricular units 1st sem (approved)']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Curriculum_interaction_2nd_sem'] = df['Curricular units 2nd sem (credited)'] * df['Curricular units 2nd sem (enrolled)'] * df['Curricular units 2nd sem (evaluations)'] * df['Curricular units 2nd sem (approved)']
A value is trying to be set on a copy 

In [7]:
# Store scores
results = []

# LGBoost

In [8]:
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm 

tqdm.pandas() 

class prepare_X_2(): 
    def __init__(self): 
        self.encoders = dict()
        
        
    def one_hot_encode_column(self, df, col_name):
        """
        One-hot encodes the specified column in the given DataFrame.

        Parameters:
        df (pd.DataFrame): The input DataFrame.
        col_name (str): The name of the column to one-hot encode.

        Returns:
        pd.DataFrame: The DataFrame with the specified column one-hot encoded.
        """
        df = df.copy()
        # Initialize new the OneHotEncoder
        encoder = OneHotEncoder(sparse_output=False, drop='if_binary', handle_unknown ='ignore')  # drop='first' to avoid multicollinearity

        # Fit and transform the specified column
        encoded_col = encoder.fit(df[[col_name]])
        self.encoders[col_name] = encoder
        
        return f"{col_name} encoder succesfully fitted"
    
    def fit(self, X): 
        for i in [k for k, v in cfg.input_columns_params.items() if v == 'cat']: 
            self.one_hot_encode_column(df = X, col_name = i)
        return X 
    
    def __call__(self, X): 
        df = X.copy()
        for col_name in [k for k, v in cfg.input_columns_params.items() if v == 'cat']:
            
            encoder = self.encoders[col_name]
            encoded_col = encoder.transform(df[[col_name]])
            # Create a DataFrame with the encoded columns
            encoded_df = pd.DataFrame(encoded_col, columns=encoder.get_feature_names_out([col_name]), index=df.index)

            # Drop the original column and concatenate the encoded columns
            df = df.drop(columns=[col_name])
            df = pd.concat([df, encoded_df], axis=1)
            
        return df 
    
    def __str__(self): 
        return ", ".join([k for k, v in self.encoders.items()])

# train Encoders 
prep2 = prepare_X_2() 
prep2.fit(pd.concat([X, test_X]))

X = prep2(X)

y_uniques = y['Target'].unique()    
y['Target'] = y['Target'].progress_apply(lambda x: [i for i, v in enumerate(y_uniques) if x == v][0])

100%|██████████| 76518/76518 [00:00<00:00, 203511.29it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['Target'] = y['Target'].progress_apply(lambda x: [i for i, v in enumerate(y_uniques) if x == v][0])


In [9]:
from lightgbm import LGBMClassifier 

# K-Fold Cross-Validation

kf = RepeatedStratifiedKFold(n_splits=cfg.n_splits, n_repeats=cfg.n_repeats, random_state=cfg.seed)


for train_size, test_size in kf.split(X, y): 
    train_X, train_y = X.iloc[train_size], y.iloc[train_size]
    test_X, test_y = X.iloc[test_size], y.iloc[test_size]

    clf = LGBMClassifier(**cfg.lightgbm_model_params)

    # Train the model
    clf.fit(train_X, train_y['Target'], eval_set=[(test_X, test_y['Target'])])
    
    # Make predictions
    y_pred = clf.predict(test_X)
    
    # Evaluate the model
    accuracy = accuracy_score(test_y, y_pred)
    print("Model accuracy", accuracy)
    results.append({'model': clf, 'accuracy': accuracy, 'type': 'lgbm'}) 

Model accuracy 0.8317390622549788
Model accuracy 0.8344048925827191
Model accuracy 0.8378547906539126
Model accuracy 0.8353457738748628
Model accuracy 0.8317390622549788
Model accuracy 0.8293868590246197
Model accuracy 0.8310853199498118
Model accuracy 0.8312421580928482
Model accuracy 0.8306148055207027
Model accuracy 0.8354767879548306
Model accuracy 0.8370451693851945
Model accuracy 0.8290464240903388


# Picking the best model

In [10]:
# Function to sort the list of dictionaries by a specified key
def sort_by_key(results, key):
    return sorted(results, key=lambda x: x[key], reverse=True)

# Sort results by "accuracy"
sorted_results = sort_by_key(results, "accuracy")

# Print sorted results
print(sorted_results[:5])
best = sorted_results[0]

[{'model': LGBMClassifier(colsample_bytree=0.7975468653525116, data_sample_strategy='goss',
               early_stopping_rounds=300, feature_fraction=0.298183729482288,
               lambda_l1=8.242410039948067e-07, lambda_l2=0.4063299210212167,
               learning_rate=0.02636616162598401, max_bin=156, max_depth=8,
               min_child_samples=102, min_gain_to_split=0.7247318987185962,
               min_sum_hessian_in_leaf=5.440582524630883, n_estimators=1743,
               num_leaves=699, objective='multiclass', random_state=1,
               top_rate=0.6132659772851583, tree_learner='feature', verbose=-1), 'accuracy': 0.8378547906539126, 'type': 'lgbm'}, {'model': LGBMClassifier(colsample_bytree=0.7975468653525116, data_sample_strategy='goss',
               early_stopping_rounds=300, feature_fraction=0.298183729482288,
               lambda_l1=8.242410039948067e-07, lambda_l2=0.4063299210212167,
               learning_rate=0.02636616162598401, max_bin=156, max_depth=8,

# Making an submission

In [11]:
test_df = pd.read_csv(cfg.test_path)
test_X = prepare_X_1(test_df)

for i, best in enumerate(sorted_results[:5]): 
    if best['type'] == 'lgbm': 
        test_inputs = prep2(test_X)
        preds = best['model'].predict(test_inputs)

        preds = [y_uniques[i] for i in preds]
    elif (best['type'] == 'catboost') or (best['type'] == 'xgboost'): 
        preds = best['model'].predict(test_X)
        preds = [i[0] for i in preds]
    else: 
        print('model predict still dont writed')
    
    submission = pd.DataFrame({'id': test_df.id, 'Target': preds})
    submission.to_csv(f"submission{i}.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Curriculum_interaction_1st_sem'] = df['Curricular units 1st sem (credited)'] * df['Curricular units 1st sem (enrolled)'] * df['Curricular units 1st sem (evaluations)'] * df['Curricular units 1st sem (approved)']
