# Classification with an Academic Success Dataset

**The aim of this notebook is to analyze this dataset, extract insights, and predict whether a student will graduate, remain enrolled, or drop out of a specific course.**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load



# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
# Load the competition's train and test splits.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s4e6/train.csv',
        '/kaggle/input/playground-series-s4e6/test.csv',
    )
)

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Column names, dtypes and non-null counts of the training data.
train.info()

In [None]:
# Descriptive statistics of the training data.
train.describe()

In [None]:
# Use the `id` column as the row index. The membership check keeps this cell
# safe to re-run: once `id` has become the index it is no longer a column, and
# an unguarded set_index('id') would raise KeyError.
if 'id' in train.columns:
    train.set_index('id', inplace=True)

In [None]:
import warnings
# Suppresses every warning category from here on, so library deprecation
# notices will not be shown either.
warnings.filterwarnings('ignore')

 # **1. EDA**

### 1.1. General Exploration

In [None]:
# One-hot encode the target column; drop_first=True omits the indicator of the
# first class, leaving one indicator column per remaining class.
train_enc = pd.get_dummies(data=train, columns=['Target'], drop_first=True)

In [None]:
# Class balance of the target column.
sns.histplot(x='Target', data=train)

# Observation: graduates are the largest group of students.

In [None]:
# Pairwise correlations between all columns except the target.
feature_corr = train.drop(columns='Target').corr()
sns.heatmap(feature_corr, cmap='viridis')

In [None]:
# 20-bin histogram of every numeric column on one large canvas.
train.hist(figsize=(20, 10), bins=20)
plt.tight_layout()

In [None]:
#### Home Country Stats vs. Target

In [None]:
# Density of each home-country macro indicator, split by target class.
country_cols = ['Unemployment rate', 'GDP', 'Inflation rate']
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(10, 6))
axes = axes.flatten()
for panel, column in zip(axes, country_cols):
    sns.kdeplot(data=train, x=column, hue='Target', ax=panel)


In [None]:
# Correlation of the three macro indicators with the graduate indicator column.
train_enc[['Unemployment rate', 'Inflation rate', 'GDP', 'Target_Graduate']].corr()

In [None]:
# One row of per-class density plots for each macro-economic indicator.
# `fill=True` replaces kdeplot's `shade=True`: seaborn deprecated `shade` in
# favour of `fill` and announced that it will become an error in v0.14.
for column in country_cols:
    g = sns.FacetGrid(train, col='Target', col_wrap=4, height=4, hue='Target')
    g.map(sns.kdeplot, column, fill=True)


In [None]:
# Distribution of the nationality codes, coloured by target class.
sns.histplot(train, x= 'Nacionality',hue = 'Target')

In [None]:
# Number of students per nationality code.
train['Nacionality'].value_counts()

The `Nacionality` column is not diverse enough to be informative: almost all students are Portuguese (code 1).

In [None]:
# Remove the nationality column from the EDA frame (mutates `train` in place).
train.drop(columns='Nacionality', inplace=True)

### 1.2. Family Stats vs. Target

In [None]:
# Density of each parental qualification/occupation code, split by target class.
Family_cols = [
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
]
fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(20, 6))
axes = axes.flatten()
for panel, column in zip(axes, Family_cols):
    sns.kdeplot(data=train, x=column, hue='Target', ax=panel)

In [None]:
# Graduation rate per mother's-qualification code, highest rate first.
gg = (
    train_enc
    .groupby(by="Mother's qualification")['Target_Graduate']
    .mean()
    .reset_index()
    .sort_values(by='Target_Graduate', ascending=False)
)
gg



In [None]:
# Graduation rate and group size per mother's-qualification code, drawn on two
# separate axes. Issuing both seaborn calls against a single figure puts the
# count bars on top of the rate bars (very different y scales), which makes
# both charts unreadable.
fig, (rate_ax, count_ax) = plt.subplots(nrows=2, ncols=1, figsize=(10, 10))
sns.barplot(gg, x='Mother\'s qualification', y='Target_Graduate', ax=rate_ax)
rate_ax.set_title('Graduation rate by mother\'s qualification')
sns.countplot(train, x='Mother\'s qualification', ax=count_ax)
count_ax.set_title('Number of students by mother\'s qualification')
plt.tight_layout()

In [None]:
# Scratch example: population variance (numpy's default ddof=0) of 1..5.
arr = np.arange(1, 6)
variance = arr.var()
variance

In [None]:
# Pool both parents' qualification codes into one series and count each code.
parent_quals = train[["Mother's qualification", "Father's qualification"]].stack()
value_counts = parent_quals.value_counts()

# Two-column frame: the qualification code and how often it occurs.
value_counts_df = value_counts.reset_index()
value_counts_df.columns = ['Qualification', 'Count']

# Render the table with an in-cell bar behind every count.
styled_counts = value_counts_df.style.bar(subset=['Count'], color='#5fba7d')
styled_counts

Most students' parents have only a basic education.

In [None]:
# Count (mother, father) qualification pairs within each target class; unstack
# pivots the innermost index level (father's qualification) into columns.
parent_cols = ["Mother's qualification", "Father's qualification"]
counts = train.groupby(by='Target')[parent_cols].value_counts().unstack()
counts.head()

In [None]:
# Mother's vs. father's qualification code for every student, coloured by target class.
sns.scatterplot(train,x='Mother\'s qualification',y='Father\'s qualification', hue='Target')

In [None]:
# 1st-semester grade aggregated per mother's-qualification code, one line per
# target class. `errorbar=None` is the current spelling of the deprecated
# `ci=None` (seaborn >= 0.12) and likewise switches the confidence band off.
sns.lineplot(train, x='Mother\'s qualification', y='Curricular units 1st sem (grade)', hue='Target', errorbar=None)

### 1.3. Students status vs Target

In [None]:
# Re-list the columns still present in the training frame.
train.info()

In [None]:
# Columns that describe the student's personal and enrolment status.
students_cols = [
    'Gender',
    'Displaced',
    'Age at enrollment',
    'Tuition fees up to date',
    'Daytime/evening attendance',
    'Course',
    'Application order',
    'Application mode',
    'Marital status',
    'Educational special needs',
    'Debtor',
]

train[students_cols]

In [None]:
# Correlation between the student-status columns and the two target indicators.
status_and_target = students_cols + ['Target_Enrolled', 'Target_Graduate']
sns.heatmap(train_enc[status_and_target].corr(), cmap='viridis')

#### **1.3.1 Age**

In [None]:
# Age distribution with a KDE overlay. `sns.distplot` is deprecated; a
# density-normalised `histplot` with `kde=True` is its documented replacement
# and produces the same kind of figure.
sns.histplot(train, x='Age at enrollment', stat='density', kde=True)

In [None]:
# Row of the student with the highest enrolment age (shown as a one-row frame).
train.loc[[train['Age at enrollment'].idxmax()]]

In [None]:
# Row of the student with the lowest enrolment age (shown as a one-row frame).
train.loc[[train['Age at enrollment'].idxmin()]]

In [None]:
# Is age related to graduation? One age histogram (with KDE) per target class.
g = sns.FacetGrid(data=train, col='Target', height=4, col_wrap=4)
g.map(sns.histplot, 'Age at enrollment', kde=True)

Graduation is very low among students who enrolled at age 20 or older.

In [None]:
# Students per target class at each enrolment age (ages as rows, classes as
# columns). fill_value=0 records an absent (age, class) pair as a zero count;
# a plain unstack() leaves NaN there, and idxmin/idxmax skip NaN, so an age
# with no students in a class could never be reported as that class's minimum.
Age = train.groupby(by='Age at enrollment')['Target'].value_counts().unstack(fill_value=0)
Age.head()

In [None]:
# Age with the largest number of dropouts (one-row view of the count table).
Age.loc[[Age['Dropout'].idxmax()]]

In [None]:
# Age with the smallest number of dropouts (one-row view of the count table).
Age.loc[[Age['Dropout'].idxmin()]]

In [None]:
# Age with the largest number of graduates (one-row view of the count table).
Age.loc[[Age['Graduate'].idxmax()]]

In [None]:
# Age with the smallest number of graduates (one-row view of the count table).
Age.loc[[Age['Graduate'].idxmin()]]

In [None]:
# Spread of enrolment age within each target class.
plt.figure(figsize=(10, 6))
age_ax = sns.boxplot(x='Target', y='Age at enrollment', data=train)
age_ax.set_xlabel('Target')
age_ax.set_ylabel('Age at Enrollment')


##### Investigating the correlation between Age and Application mode:


In [None]:
# Correlation between enrolment age and the application-mode code.
train[['Age at enrollment','Application mode']].corr()

In [None]:
# Number of students of each age within every application mode
# (application modes as rows, ages as columns).
age_by_mode = train.groupby(by='Application mode')['Age at enrollment']
app_mode = age_by_mode.value_counts().unstack()
app_mode.head()

In [None]:
# Bivariate KDE of enrolment age vs. application mode. `displot` is a
# figure-level function that always creates its own figure, so the size is set
# through height/aspect; a preceding plt.figure(figsize=...) would only leave
# an extra empty figure in the output.
sns.displot(train, x='Age at enrollment', y='Application mode', cmap='viridis', kind='kde',
            height=6, aspect=10 / 6)
plt.tight_layout()

In [None]:
# Enrolment age vs. application mode, coloured by target class. Large, highly
# transparent diamond markers are used with jitter disabled.
plt.figure(figsize=(10, 7))
marker_style = dict(jitter=False, s=20, marker="D", linewidth=1, alpha=.1)
sns.stripplot(train, x='Age at enrollment', y='Application mode', hue='Target', **marker_style)
plt.tight_layout()

##### Investigating the correlation between Age and Marital status:


In [None]:
# Correlation between enrolment age and the marital-status code.
train[['Age at enrollment','Marital status']].corr()

In [None]:
# Enrolment age for each marital-status code, coloured by target class.
sns.scatterplot(train, x= 'Marital status', y= 'Age at enrollment', hue= 'Target')

#### 1.3.2 **Gender**

In [None]:
# Target-class counts per gender code (0 = Female, 1 = Male).
plt.figure(figsize=(10, 6))
sns.countplot(data=train, x='Gender', hue='Target')


In [None]:
# Number of students per gender code.
train['Gender'].value_counts()

In [None]:
# Columns and dtypes of the one-hot encoded frame.
train_enc.info()

In [None]:
# Mean of the graduate indicator (i.e. the graduation rate) per gender code.
gender_groups = train_enc.groupby('Gender')
g = gender_groups['Target_Graduate'].mean().reset_index()

g
# Observation: on average, females graduate at a higher rate than males.

In [None]:
# Graduation rate per gender. The category names go on the x axis itself:
# plt.legend(labels=...) without handles attaches the labels to whatever
# artists exist, purely by draw order, which does not reliably identify bars.
gender_names = {0: '0: Female', 1: '1: Male'}
gender_rate = train_enc.groupby('Gender')['Target_Graduate'].mean().reset_index()
# Unknown codes fall back to their plain string form instead of being dropped.
gender_rate['Gender'] = gender_rate['Gender'].map(lambda code: gender_names.get(code, str(code)))
sns.barplot(gender_rate, x='Gender', y='Target_Graduate')
plt.title('Graduation rate')


In [None]:
# Correlation between gender and application-mode code (the author observed it to be low).
train[['Gender','Application mode']].corr()

#### 1.3.3 Debtor, Displaced, Tuition fees up to date and other features

In [None]:
# Re-list the available columns before looking at the remaining binary features.
train.info()

In [None]:
# Target-class counts for each Debtor value (Debtor as rows, classes as columns).
train.groupby(by='Debtor')['Target'].value_counts().unstack()

In [None]:
# Graduation rate (mean of the graduate indicator) per Debtor value.
debtor_groups = train_enc.groupby('Debtor')
g = debtor_groups['Target_Graduate'].mean().reset_index()

g

In [None]:
# Graduation rate per Displaced value (despite its name, `g_deb` is grouped by
# the Displaced column, not Debtor).
displaced_groups = train_enc.groupby('Displaced')
g_deb = displaced_groups['Target_Graduate'].mean().reset_index()

g_deb

In [None]:
# Graduation rate per 'Tuition fees up to date' value.
fee_groups = train_enc.groupby('Tuition fees up to date')
g_fees = fee_groups['Target_Graduate'].mean().reset_index()

g_fees

In [None]:
# Graduation rate per 'Scholarship holder' value.
scholarship_groups = train_enc.groupby('Scholarship holder')
g_sch = scholarship_groups['Target_Graduate'].mean().reset_index()

g_sch

In [None]:
# Graduation rate per 'Educational special needs' value.
esn_groups = train_enc.groupby('Educational special needs')
g_esn = esn_groups['Target_Graduate'].mean().reset_index()
g_esn

In [None]:
# Graduation rate per 'Daytime/evening attendance' value.
attendance_groups = train_enc.groupby('Daytime/evening attendance')
g_de = attendance_groups['Target_Graduate'].mean().reset_index()
g_de

In [None]:
# Graduation rate per 'International' value.
international_groups = train_enc.groupby('International')
g_intr = international_groups['Target_Graduate'].mean().reset_index()

g_intr

In [None]:
    fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(10,6))
    axes=axes.flatten()

sns.barplot(train_enc.groupby('Debtor')['Target_Graduate'].mean().reset_index(),x='Debtor', y= 'Target_Graduate', ax =axes[0] )
axes[0].set_title('by Debtor Status')
axes[0].legend(labels=['1: Yes','0: No'],title= 'Debtor', loc='upper right')

sns.barplot(train_enc.groupby('Displaced')['Target_Graduate'].mean().reset_index(),x='Displaced', y= 'Target_Graduate', ax =axes[1] )
axes[1].set_title('by Displaced Status')
axes[1].legend(labels=['1: Yes','0: No'],title = 'Displaced', loc='upper right')

sns.barplot(train_enc.groupby('Tuition fees up to date')['Target_Graduate'].mean().reset_index(),x='Tuition fees up to date', y= 'Target_Graduate', ax =axes[2] )
axes[2].set_title('by Tuition Fees Up to Date Status')
axes[2].legend(labels=['1: Yes','0: No'], title = 'Tuition fees up to date', loc='upper right')

sns.barplot(train_enc.groupby('Scholarship holder')['Target_Graduate'].mean().reset_index(),x='Scholarship holder', y= 'Target_Graduate', ax =axes[3] )
axes[3].set_title('by scholarship status')
axes[3].legend(labels=['1: Yes','0: No'], title = 'Scholarship holder', loc='upper right')

sns.barplot(g_esn,x='Educational special needs', y= 'Target_Graduate', ax =axes[4] )
axes[4].set_title('by ESN')
axes[4].legend(labels=['1: Yes','0: No'], title = 'ESN', loc='upper right')

sns.barplot(g_de,x='Daytime/evening attendance', y= 'Target_Graduate', ax =axes[5] )
axes[5].set_title('by Daytime/Evening attendace')
axes[5].legend(labels=['1: daytime','0: evening'], title = 'Daytime/evening', loc='upper right')


sns.barplot(g_intr,x='International', y= 'Target_Graduate', ax =axes[6] )
axes[6].set_title('by International')
axes[6].legend(labels=['1: Yes','0: No'], title = 'International', loc='upper right')

fig.suptitle('Graduation Rate Analysis by Various Factors', fontsize=20, fontweight='bold')

fig.delaxes(axes[7])
plt.tight_layout()


#### 1.3.4 Marital Status

In [None]:
# Number of students per marital-status code.
plt.figure(figsize=(6, 6))
sns.countplot(data=train, x='Marital status')

In [None]:
# Graduation rate per marital-status code.
marital_groups = train_enc.groupby(by='Marital status')
g_mar = marital_groups['Target_Graduate'].mean().reset_index()
g_mar

In [None]:
# Graduation rate per marital status. The status names are written onto the x
# axis: a legend built from labels alone is matched to artists only by draw
# order and does not reliably identify the individual bars.
marital_names = {
    1: '1: single', 2: '2: married', 3: '3: widower', 4: '4: divorced',
    5: '5: facto union', 6: '6: legally separated',
}
# Work on a copy so the shared `g_mar` frame keeps its numeric codes.
marital_rate = g_mar.copy()
marital_rate['Marital status'] = marital_rate['Marital status'].map(
    lambda code: marital_names.get(code, str(code))
)
marital_ax = sns.barplot(marital_rate, x='Marital status', y='Target_Graduate')
# The names are long; rotate them so neighbouring ticks do not overlap.
marital_ax.tick_params(axis='x', labelrotation=45)

plt.title('Graduation rate by Marital status')

#### 1.3.5 Course

In [None]:
# Number of students per course code.
train['Course'].value_counts()

In [None]:
# Number of students per course code.
plt.figure(figsize=(10, 6))
sns.countplot(data=train, x='Course', palette='magma')
plt.tight_layout()

In [None]:
# Graduation rate per course code, highest rate first.
g_course = (
    train_enc
    .groupby(by='Course')['Target_Graduate']
    .mean()
    .reset_index()
    .sort_values(by='Target_Graduate', ascending=False)
)
g_course

In [None]:
# Graduation rate per course code.
plt.figure(figsize=(10, 6))
sns.barplot(data=g_course, x='Course', y='Target_Graduate')


##### Course 9500 has the highest graduation rate, while courses 39 and 979 have no graduates.

#### 1.3.6 Grades

In [None]:
# Re-list the available columns before selecting the grade-related ones.
train.info()

In [None]:
# Grade-related columns: the admission grade, the six curricular-unit measures
# for each of the two semesters, and the grade of the previous qualification.
_unit_measures = ('credited', 'enrolled', 'evaluations', 'approved', 'grade', 'without evaluations')
Grades_cols = (
    ['Admission grade']
    + [
        f'Curricular units {semester} sem ({measure})'
        for semester in ('1st', '2nd')
        for measure in _unit_measures
    ]
    + ['Previous qualification (grade)']
)

In [None]:
# Correlation between the grade-related columns and the graduate indicator.
grades_and_target = Grades_cols + ['Target_Graduate']
sns.heatmap(train_enc[grades_and_target].corr(), cmap='viridis')

In [None]:
# Graduation rate per number of credited units, separately for each semester.
graduate_flag = 'Target_Graduate'
g1 = train_enc.groupby('Curricular units 1st sem (credited)')[graduate_flag].mean().reset_index()
g2 = train_enc.groupby('Curricular units 2nd sem (credited)')[graduate_flag].mean().reset_index()

In [None]:
# Side-by-side bars: graduation rate vs. credited units in each semester.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

credit_panels = (
    (g1, 'Curricular units 1st sem (credited)'),
    (g2, 'Curricular units 2nd sem (credited)'),
)
for panel, (rates, column) in zip(axes, credit_panels):
    sns.barplot(rates, x=column, y='Target_Graduate', ax=panel)

fig.suptitle('Semester Credits vs Graduation rate', fontsize=20, fontweight='bold')


In [None]:
# Per-class density of the two entry grades, one grid of panels per grade.
for grade_col in ('Admission grade', 'Previous qualification (grade)'):
    g = sns.FacetGrid(train, col='Target')
    g.map(sns.kdeplot, grade_col)
g

In [None]:
# Correlation between the previous-qualification grade and the graduate indicator.
train_enc[['Previous qualification (grade)','Target_Graduate']].corr()

In [None]:
# Correlation between approved 1st-semester units and the graduate indicator.
train_enc[['Curricular units 1st sem (approved)','Target_Graduate']].corr()

In [None]:
# Correlation between approved 2nd-semester units and the graduate indicator.
train_enc[['Curricular units 2nd sem (approved)','Target_Graduate']].corr()

In [None]:
# Per-class density of approved units, one grid of panels per semester.
for units_col in ('Curricular units 1st sem (approved)', 'Curricular units 2nd sem (approved)'):
    g = sns.FacetGrid(train, col='Target', hue='Target')
    g.map(sns.kdeplot, units_col)
g

Students who graduate mostly have between 4 and 10 approved curricular units.

In [None]:
# Correlation between both semester grades and the graduate indicator.
train_enc[['Curricular units 1st sem (grade)','Curricular units 2nd sem (grade)', 'Target_Graduate']].corr()

In [None]:
# Per-class density of the semester grade, one grid of panels per semester.
for grade_col in ('Curricular units 1st sem (grade)', 'Curricular units 2nd sem (grade)'):
    g = sns.FacetGrid(train, col='Target', hue='Target')
    g.map(sns.kdeplot, grade_col)
g

In [None]:
# Correlation of the "without evaluations" unit counts of BOTH semesters with
# the graduate indicator. The 1st-semester column used to be listed twice,
# which left the 2nd semester out of the table.
train_enc[['Curricular units 1st sem (without evaluations)','Curricular units 2nd sem (without evaluations)', 'Target_Graduate']].corr()

In [None]:
# Graduation rate per previous-qualification code.
prev_qual_groups = train_enc.groupby(by='Previous qualification')
g_pq = prev_qual_groups['Target_Graduate'].mean().reset_index()
g_pq

In [None]:
# Graduation rate per previous-qualification code.
sns.barplot(g_pq, x = 'Previous qualification', y= 'Target_Graduate')

In [None]:
# Graduation rate per application-mode code, highest rate first.
g_appmode = (
    train_enc
    .groupby(by='Application mode')['Target_Graduate']
    .mean()
    .reset_index()
    .sort_values(by='Target_Graduate', ascending=False)
)
g_appmode

In [None]:
# Graduation rate per application-mode code.
sns.barplot(g_appmode, x = 'Application mode', y= 'Target_Graduate')

# 2. Feature Engineering

In [None]:
# Reload both splits from disk so that modelling starts from the untouched
# files rather than from the frames modified during the EDA.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s4e6/train.csv',
        '/kaggle/input/playground-series-s4e6/test.csv',
    )
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


In [None]:
# Columns and dtypes of the freshly reloaded training data.
train.info()

In [None]:
# Drop the same three columns from both splits so train and test keep an
# identical feature set (both frames are modified in place).
drop_cols = ['International', 'Educational special needs', 'Nacionality']
for frame in (train, test):
    frame.drop(columns=drop_cols, inplace=True)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the class labels as integers. The fitted encoder is kept in `lab_enc`
# so that predictions can be mapped back to the original labels later.
lab_enc = LabelEncoder()
lab_enc.fit(train['Target'])
train['Target'] = lab_enc.transform(train['Target'])


In [None]:
# Original class labels, in the order of their integer codes.
lab_enc.classes_

In [None]:
# The target column after label encoding.
train['Target']

In [None]:
# Frequency of each distinct GDP value in the test split.
test['GDP'].value_counts()

# ML model

In [None]:
# Load the competition's sample submission; it is reused further down as the
# template for the final submission file.
sample_subm = pd.read_csv('/kaggle/input/playground-series-s4e6/sample_submission.csv')
sample_subm

In [None]:
# NOTE(review): MLPClassifier and RandomOverSampler are only referenced by the
# commented-out oversampling experiment in this cell — confirm whether they
# are still needed.
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler


# Feature matrix: every column except the label and the row identifier.
X = train.drop(['Target', 'id'], axis=1)
# Label vector as a 1-D Series. There is a single target column; selecting it
# with [['Target']] produced a one-column DataFrame, which scikit-learn style
# estimators treat as a column vector (DataConversionWarning) rather than the
# expected 1-D array of labels.
y = train['Target']

# # Apply oversampling using RandomOverSampler
# oversampler = RandomOverSampler(sampling_strategy='auto', random_state=42)
# X_resampled, y_resampled = oversampler.fit_resample(X, y)

# Hold out 30% of the labelled rows for validation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print("X_train shape: ", X_train.shape)
print("Y_train shape: ", y_train.shape)
print("X_test shape: ", X_test.shape)
print("y_test shape: ", y_test.shape)


# Example prediction (you would typically use your test data for this)
# test_X = ...  # Prepare your test data similarly
# predictions = model.predict(test_X)

In [None]:
## Testing Different models

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
from lightgbm import LGBMClassifier

# Hyper-parameters for the multiclass LightGBM model.
# NOTE(review): the unrounded values look like the result of an automated
# search — confirm their provenance before re-tuning.
lgbm_params = {
    'objective': 'multiclass',
    'data_sample_strategy': 'goss',
    'tree_learner': 'feature',
    'n_estimators': 743,
    'learning_rate': 0.02636616162598401,
    # Fraction of features sampled per tree. This dict used to contain
    # 'colsample_bytree': 0.7975468653525116 as well; LightGBM documents that
    # name as an alias of feature_fraction and keeps the primary name when
    # both are supplied, so the alias never took effect and has been removed.
    'feature_fraction': 0.398183729482288,
    'lambda_l1': 6.242410039948067e-07,
    'lambda_l2': 0.4063299210212167,
    'num_leaves': 759,
    'max_depth': 50,
    'min_child_samples': 102,
    'min_sum_hessian_in_leaf': 5.440582524630883,
    'min_gain_to_split': 0.7247318987185962,
    'max_bin': 156,
    'top_rate': 0.6132659772851583,
    'verbose': -1,
    'random_state': 42,
}

# Fit on the training part of the split and predict the held-out part.
lgbm = LGBMClassifier(**lgbm_params)
lgbm.fit(X_train, y_train)
y_pred = lgbm.predict(X_test)

# Evaluate the model on the held-out rows.
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy: {accuracy * 100:.2f}%')

In [None]:
# Per-class metrics report and confusion matrix for the hold-out predictions,
# printed one after the other.
class_report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
print(class_report,conf_matrix )

In [None]:
# Remove the identifier so the test frame has the same columns as the training
# features. Modifies `test` in place, so this cell cannot simply be re-run.
test.drop(columns='id', inplace =True)

In [None]:
y_pred = lgbm.predict(test)


In [None]:
y_test=lab_enc.inverse_transform(y_pred)


In [None]:
sample_subm['Target']=y_test
submission=sample_subm
submission.to_csv("submission.csv", index=False)
submission