In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Fetch the dataset through kagglehub and keep the value it returns
# (per the kagglehub docs, the local path of the downloaded files).
DATASET_HANDLE = 'dskagglemt/student-performance-data-set'
dskagglemt_student_performance_data_set_path = kagglehub.dataset_download(DATASET_HANDLE)

print('Data source import complete.')


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification
from collections import Counter
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Load the semicolon-separated maths dataset into `raw_data`.
#
# The file is looked up in two places, in order:
#   1. the Kaggle input mount (the location this notebook originally hard-coded);
#   2. the directory returned by kagglehub.dataset_download, if the import cell
#      at the top of the notebook was run (it is optional and may have been deleted).
# If neither location has the file, the Kaggle path is still passed to read_csv so
# the failure is the same FileNotFoundError as before.
from pathlib import Path

_DATA_FILE = "student-mat.csv"
_candidate_dirs = [Path("/kaggle/input/student-performance-data-set")]

# globals().get(...) avoids a NameError when the kagglehub cell was skipped.
_kagglehub_dir = globals().get("dskagglemt_student_performance_data_set_path")
if _kagglehub_dir:
    # NOTE(review): assumes the CSV sits at the top level of the download dir — confirm.
    _candidate_dirs.append(Path(_kagglehub_dir))

_candidate_files = [directory / _DATA_FILE for directory in _candidate_dirs]
_data_path = next((path for path in _candidate_files if path.is_file()), _candidate_files[0])

raw_data = pd.read_csv(_data_path, delimiter=';')

In [None]:
# Print a summary of the frame: column names, dtypes, non-null counts, memory use.
raw_data.info()

In [None]:
# Preview the first five rows of the freshly loaded data.
raw_data.head()

In [None]:
# List every column label so later cells can refer to them by name.
print(raw_data.columns)

In [None]:
# One histogram (with KDE overlay) per grade column, drawn side by side.
# Each entry pairs a grade column with the colour used for its panel.
grade_panels = [('G1', 'skyblue'), ('G2', 'lightgreen'), ('G3', 'lightcoral')]

fig, axes = plt.subplots(1, 3, figsize=(18, 6))
for panel_ax, (grade_col, panel_colour) in zip(axes, grade_panels):
    sns.histplot(raw_data[grade_col], kde=True, bins=20, color=panel_colour, ax=panel_ax)
    panel_ax.set_title(f'Distribution of {grade_col} Grades (0 to 20)')
    panel_ax.set_xlabel(f'Grade {grade_col}')
    panel_ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
# Let pandas render up to 35 columns before it starts eliding the middle ones.
pd.options.display.max_columns = 35

In [None]:
# Preview the first rows again now that display.max_columns has been raised to 35,
# so more columns are rendered instead of being elided.
raw_data.head()

In [None]:
# Flag every row that is an exact copy of an earlier row (first occurrence is not
# flagged), then report how many such rows exist.
# Summing the boolean mask counts the True entries. Indexing the frame with the
# mask and summing *that* would instead add up (or, for text, concatenate) the
# column values of the duplicated rows, which is not a duplicate count.
duplicates = raw_data.duplicated()
duplicates.sum()

In [None]:
# Visual check for missing values: build a boolean mask (True where a value is
# null) and draw it as a heatmap, one column per feature and one row per record.
missing_data = raw_data.isnull()

missing_fig, missing_ax = plt.subplots(figsize=(12, 6))
sns.heatmap(
    missing_data,
    cbar=False,
    cmap='viridis',
    yticklabels=False,
    xticklabels=raw_data.columns,
    ax=missing_ax,
)
missing_ax.set_title('Heatmap of Missing Data')
missing_ax.set_xlabel('Columns')
missing_ax.set_ylabel('Rows')
plt.show()

In [None]:
# Remove the four columns that are not carried into the prepared dataset.
# The drop is done by rebinding rather than in place, and errors='ignore' makes
# the cell safe to re-run: once the columns are gone a second execution is a
# no-op instead of raising KeyError.
raw_data = raw_data.drop(columns=['school', 'address', 'age', 'sex'], errors='ignore')

In [None]:
# Text-valued columns that the next cell converts to integer codes.
# Despite the name, not every entry is two-valued: the lookup applied to them
# carries more than two codes for some label families (e.g. 'teacher' -> 4).
binary_cols = [
    'famsize', 'Pstatus',
    'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'schoolsup',
] + [
    # the four columns whose distinct values are inspected individually after encoding
    'Fjob', 'guardian', 'reason', 'Mjob',
]

In [None]:
# Integer-encode the columns listed in `binary_cols` with one shared
# label -> code lookup.
#
# A column is encoded only when every distinct value it holds is a key of
# `binary_map`; otherwise mapping would turn the unknown labels into NaN.
# Columns that are skipped fall into two cases:
#   * already numeric (e.g. this cell was run before) -> nothing to do, stay silent;
#   * still text with labels missing from the map     -> warn, because later cells
#     (mean/std, MinMaxScaler) need numeric input and would otherwise fail far
#     from the cause.
#
# NOTE(review): the codes are shared across columns ('other' is 3 everywhere, so a
# column holding only father/mother/other gets 0, 1, 3) and impose an arbitrary
# order on unordered categories — confirm this is acceptable for the downstream model.
import warnings

binary_map = {
    'yes': 1, 'no': 0,
    'GT3': 1, 'LE3': 0,
    'A': 1, 'T': 0,
    'father': 1, 'mother': 0, 'other': 3,
    'course': 0, 'home': 1, 'reputation': 2,
    'at_home': 0, 'health': 1, 'services': 2, 'teacher': 4,
}

for col in binary_cols:
    observed_values = set(raw_data[col].unique())
    if observed_values.issubset(binary_map.keys()):
        raw_data[col] = raw_data[col].map(binary_map)
    elif not pd.api.types.is_numeric_dtype(raw_data[col]):
        unmapped = sorted(str(value) for value in observed_values.difference(binary_map))
        warnings.warn(
            f"Column {col!r} was left unencoded; values without a code: {unmapped}"
        )

In [None]:
# Show the distinct values now present in the two job columns (Fjob first, then Mjob)
# to check the result of the encoding step.
tuple(raw_data[job_col].unique() for job_col in ('Fjob', 'Mjob'))

In [None]:
# Distinct values of `guardian` after the encoding step. If the column was mapped,
# binary_map yields mother -> 0, father -> 1, other -> 3 (there is no code 2).
raw_data['guardian'].unique()

In [None]:
# Distinct values of `Mjob` on its own (it is also shown next to `Fjob` in an earlier cell).
raw_data['Mjob'].unique()

In [None]:
# Distinct values of `reason` after the encoding step. If the column was mapped,
# binary_map yields course -> 0, home -> 1, reputation -> 2, other -> 3.
raw_data['reason'].unique()

In [None]:
# Preview the frame after the column drop and the integer-encoding step.
raw_data.head()

In [None]:
# Z-score outlier scan.
# Every value is standardised against its own column (column mean and pandas'
# default standard deviation); a row counts as an outlier when at least one of
# its values lies more than `threshold` standard deviations from the mean.
threshold = 4

means = raw_data.mean()
std_devs = raw_data.std()
z_scores = raw_data.sub(means).div(std_devs)

exceeds_threshold = z_scores.abs().gt(threshold)
outliers = raw_data[exceeds_threshold.any(axis=1)]

print(f"Number of outliers: {len(outliers)}")
if outliers.empty:
    print("No outliers detected.")
else:
    print("\nIndices of outliers:")
    print(outliers.index.tolist())
    print("\nSample of outliers:")
    print(outliers.head())

In [None]:
# Build the model-ready frame: every column except the target G3 is min-max
# scaled, then the untouched G3 column is attached.
# Note that G1 and G2 remain in `features`, so they are scaled like any other input.
features = raw_data.drop(columns='G3')
scaler = MinMaxScaler()

# fit_transform returns a bare NumPy array, so labels must be restored by hand.
normalized_features = scaler.fit_transform(features)

# Reuse the original row index. The G3 assignment below aligns on index labels;
# with a fresh 0..n-1 index it would produce NaN / mismatched targets as soon as
# raw_data's index is not a plain range (e.g. after rows have been filtered out).
ready_data = pd.DataFrame(
    normalized_features,
    columns=features.columns,
    index=features.index,
)
ready_data['G3'] = raw_data['G3']

In [None]:
# Preview the prepared frame: min-max scaled feature columns plus the unscaled G3 target.
ready_data.head()

# The dataset is ready to use!