In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_path in (os.path.join(folder, file_name) for file_name in files):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/students-drop-out-prediction/train.csv
/kaggle/input/students-drop-out-prediction/test.csv


In [3]:
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [4]:
# Read the competition's train and test CSV files into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/students-drop-out-prediction/train.csv",
        "../input/students-drop-out-prediction/test.csv",
    )
)

In [5]:
# Columns that are NOT standardised later on: the features treated as binary,
# plus the target column `label`.
binary_col_names = ["v_1", "v_26", "v_11", "v_14", "v_30", "v_28", "v_9", "v_27", "label"]
# Every remaining column except `id` gets standardised.  The list is built by
# walking the frame's columns (instead of subtracting sets) so the feature
# order is deterministic and identical on every kernel restart.
non_binary_col_names = [
    col for col in train_df.columns
    if col not in binary_col_names and col != "id"
]

In [6]:
# Pre-process the training frame:
#   * drop the 'id' column (not a feature),
#   * standardise every non-binary feature,
#   * pass all remaining columns (binary features and 'label') through unchanged.
# NOTE(review): the scaler is fitted on every training row here, i.e. before the
# train/validation split made later in the notebook, so the hold-out rows
# contribute to the scaling statistics.
transform_pipeline = ColumnTransformer(
    [
        ("drop_id", "drop", ["id"]),
        ("scale_non_binary", StandardScaler(), non_binary_col_names)
    ],
    remainder="passthrough"
)
transform_train_data = transform_pipeline.fit_transform(train_df)
# get_feature_names_out() prefixes each column with "<transformer name>__".
# Split only on the first "__" so a column whose own name contains "__" is
# recovered in full instead of being truncated.
feature_names = [name.split("__", 1)[1] for name in transform_pipeline.get_feature_names_out()]
transform_train_df = pd.DataFrame(data=transform_train_data, columns=feature_names)
# Cast the pass-through binary columns and the label back to integers after
# rebuilding the frame from the transformer's array output.
transform_train_df = transform_train_df.astype({col: "int" for col in binary_col_names})

In [7]:
# Display the transformed training frame (scaled features, binary columns, label).
transform_train_df

Unnamed: 0,v_10,v_19,v_31,v_40,v_39,v_13,v_25,v_0,v_35,v_8,...,v_15,v_1,v_11,v_14,v_26,v_27,v_30,v_9,v_28,label
0,-0.345541,0.116531,-0.777790,-1.252232,-0.632603,-0.497793,-1.164142,-0.432218,1.443223,-0.532276,...,-0.236435,1,1,0,0,1,0,0,0,1
1,-0.020789,0.914999,-0.650992,1.421048,-0.231953,2.300727,-1.176768,0.000799,-0.739069,0.123169,...,0.151760,1,0,1,1,0,1,1,0,2
2,-0.345541,0.116531,-0.143800,-0.488438,-0.098403,-0.497793,-1.214645,-0.432218,-1.455134,-0.532276,...,-0.624630,1,1,0,0,1,1,0,0,1
3,0.303963,0.382687,-0.777790,-1.252232,1.103547,-0.497793,-1.214645,-0.432218,-0.466283,-0.532276,...,0.539956,1,1,0,0,1,0,1,0,1
4,1.927721,4.641181,2.138565,-0.870335,-0.766153,5.410194,0.262574,0.433815,-0.977758,2.963432,...,2.480932,1,0,0,1,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3791,-0.345541,-0.149625,-0.650992,0.657253,-0.721637,-0.497793,-1.214645,-0.432218,0.897650,-0.532276,...,0.539956,1,1,0,0,1,1,0,0,2
3792,2.901976,0.914999,-0.270598,1.039150,1.682263,1.989780,-1.151516,0.866831,0.829453,-0.532276,...,3.257323,1,0,1,1,1,1,1,0,2
3793,2.252473,0.382687,-0.270598,1.039150,1.726780,0.435047,-1.214645,3.031913,0.181585,2.526469,...,0.928151,1,0,1,1,1,1,1,0,0
3794,1.278218,1.979622,0.109796,1.039150,1.593230,-0.497793,1.159006,2.165881,0.147487,1.215578,...,2.869128,1,0,1,1,0,1,1,0,1


In [8]:
# Separate the target from the features without mutating `transform_train_df`,
# so this cell can be re-run safely (an in-place pop("label") raises KeyError
# on the second execution because the column is already gone).
y_train_com = transform_train_df["label"]
X_train_com = transform_train_df.drop(columns=["label"])
# Hold out 30% of the rows for validation; the seed is fixed for reproducibility.
X_train, X_valid, y_train, y_valid = train_test_split(X_train_com, y_train_com, test_size=0.3, random_state=1)

In [9]:
# Baseline 1: SVC with its default kernel (RBF), scored on the validation split.
model = SVC(random_state=1)
preds = model.fit(X_train, y_train).predict(X_valid)
print(classification_report(y_valid, preds))

              precision    recall  f1-score   support

           0       0.79      0.70      0.74       375
           1       0.73      0.94      0.82       540
           2       0.61      0.30      0.40       224

    accuracy                           0.73      1139
   macro avg       0.71      0.65      0.65      1139
weighted avg       0.73      0.73      0.71      1139



In [10]:
# Baseline 2: SVC with a linear kernel, scored on the validation split.
model = SVC(kernel="linear", random_state=1)
preds = model.fit(X_train, y_train).predict(X_valid)
print(classification_report(y_valid, preds))

              precision    recall  f1-score   support

           0       0.77      0.74      0.75       375
           1       0.74      0.93      0.82       540
           2       0.63      0.29      0.40       224

    accuracy                           0.74      1139
   macro avg       0.72      0.65      0.66      1139
weighted avg       0.73      0.74      0.72      1139



In [11]:
# Baseline 3: SVC with a degree-2 polynomial kernel, scored on the validation split.
model = SVC(kernel="poly", degree=2, random_state=1)
preds = model.fit(X_train, y_train).predict(X_valid)
print(classification_report(y_valid, preds))

              precision    recall  f1-score   support

           0       0.77      0.63      0.69       375
           1       0.67      0.94      0.78       540
           2       0.45      0.16      0.24       224

    accuracy                           0.68      1139
   macro avg       0.63      0.58      0.57      1139
weighted avg       0.66      0.68      0.65      1139



In [12]:
# Baseline 4: SVC with a degree-3 polynomial kernel, scored on the validation split.
model = SVC(kernel="poly", degree=3, random_state=1)
preds = model.fit(X_train, y_train).predict(X_valid)
print(classification_report(y_valid, preds))

              precision    recall  f1-score   support

           0       0.76      0.57      0.65       375
           1       0.62      0.93      0.74       540
           2       0.42      0.10      0.16       224

    accuracy                           0.65      1139
   macro avg       0.60      0.53      0.52      1139
weighted avg       0.63      0.65      0.60      1139



# The linear and RBF kernels perform best on the validation split, so these two move on to hyperparameter tuning (HPT).

In [13]:
# Hyperparameter tuning for the RBF kernel: exhaustive search over C and gamma,
# ranked by macro-averaged F1 with 3-fold cross-validation on the full training
# set; the best combination is refitted on all of it.
model = SVC(kernel="rbf", random_state=1)
param_grid = {
    "C": [0.01, 0.1, 1, 10, 100],
    "gamma": ["scale", 0.01, 0.1],
}
search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="f1_macro",
    cv=3,
    refit=True,
)
search.fit(X_train_com, y_train_com)

GridSearchCV(cv=3, estimator=SVC(random_state=1),
             param_grid={'C': [0.01, 0.1, 1, 10, 100],
                         'gamma': ['scale', 0.01, 0.1]},
             scoring='f1_macro')

In [15]:
# Parameter combination selected by the RBF grid search.
search.best_params_

{'C': 10, 'gamma': 0.01}

In [16]:
# Cross-validated score (scoring="f1_macro") of the selected RBF parameters.
search.best_score_

0.6517243953185

In [14]:
# Hyperparameter tuning for the linear kernel: search over C only, ranked by
# macro-averaged F1 with 3-fold cross-validation on the full training set.
# C = 100 is left out of the grid because it takes too much time with this kernel.
model = SVC(kernel="linear", random_state=1)
param_grid = {
    "C": [0.01, 0.1, 1, 10]
}
search1 = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="f1_macro",
    cv=3,
    refit=True,
)
search1.fit(X_train_com, y_train_com)

GridSearchCV(cv=3, estimator=SVC(kernel='linear', random_state=1),
             param_grid={'C': [0.01, 0.1, 1, 10]}, scoring='f1_macro')

In [19]:
# Parameter combination selected by the linear-kernel grid search.
search1.best_params_

{'C': 10}

In [20]:
# Cross-validated score (scoring="f1_macro") of the selected linear-kernel parameters.
search1.best_score_

0.6475590218980157

#### Best results obtained for
#### kernel = rbf
#### C = 10
#### gamma = 0.01
#### Best score is macro-averaged F1 score (3-fold cross-validation) = 0.65

In [21]:
# Build the submission file with the tuned RBF model (`search`).
#
# The test features must be standardised with the statistics learned from the
# TRAINING data.  Refitting the scaler on the test set would scale the test
# features with different means/variances than the ones the model was trained
# on.  The training-time transformer was fitted on a frame that still contained
# `label` (passed through as a remainder column), which the test frame is
# expected to lack, so an unfitted clone is fitted on the training features
# only and then applied to the test frame.  It emits the same columns, in the
# same order, as the training features.
test_transformer = clone(transform_pipeline)
test_transformer.fit(train_df.drop(columns=["label"]))
transform_test_data = test_transformer.transform(test_df)
# Strip the "<transformer name>__" prefix; split once so column names that
# themselves contain "__" are preserved.
feature_names = [name.split("__", 1)[1] for name in test_transformer.get_feature_names_out()]
transform_test_df = pd.DataFrame(data=transform_test_data, columns=feature_names)
# Convert predictions to plain Python ints before writing them out.
preds = [int(i) for i in search.predict(transform_test_df)]
submission = pd.DataFrame({"id": test_df["id"].to_numpy(), "label": preds})
submission.to_csv("submission.csv", index=False)