# Task 1 : The CampusPulse Initiative

## Level 0 : df Summary

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Seed NumPy's global RNG so the np.random.choice-based imputation further down is reproducible.
np.random.seed(0)

In [None]:
# Load the survey data; the path is relative to the notebook's working directory.
df = pd.read_csv('dataset.csv')
# Bare last expression so the notebook renders the frame.
df

In [None]:
# List the column labels available in the dataset.
df.columns

In [None]:
# Print per-column dtype and non-null count (a first look at where values are missing).
df.info()

## Level 1: Variable Identification Protocol 

In [None]:
# Summary statistics of the numeric columns, used to judge the range of the anonymised features.
df.describe()

In [None]:
# One integer-binned histogram per anonymised feature (Feature_1 .. Feature_3).
feature_names = ['Feature_{}'.format(k) for k in range(1, 4)]
fig, axes = plt.subplots(nrows=1, ncols=len(feature_names), figsize=(12, 4))

for ax, feature in zip(axes, feature_names):
    # Drop missing values once and reuse the same series for the bin edges AND the plot,
    # so the histogram is never fed NaNs.
    data = df[feature].dropna()
    min_val = int(data.min())
    max_val = int(data.max())
    # Half-integer edges centre every bar on an integer value.
    bins = np.arange(min_val - 0.5, max_val + 1.5, 1)

    ax.hist(data, color='skyblue', edgecolor='black', bins=bins)
    ax.set_title(feature)
    ax.set_xlabel('Values')
    ax.set_ylabel('Frequency')
    ax.set_xticks(np.arange(min_val, max_val + 1))

plt.tight_layout()
plt.savefig('Feature_hist')
plt.show()


It is clear that Feature_2 and Feature_3 are categorical variables, since they are coded as integers with a small range, while Feature_1 is a numerical column.
- 💭 Feature_1 has most of its values in the range 15-18 and looks roughly bell-shaped, which suggests it represents **age**.
- 💭 Feature_2 and Feature_3 could represent **stress levels, screen time, study time, GPA, etc**.
- 💭 The Feature_3 histogram may represent a **subtle bad habit/negative indicator**, since the frequency (number of students) goes down roughly linearly with increasing values.

In [None]:
# Relate Feature_2 to academic outcomes: mean grade and mean number of failures per level.
fig, axes = plt.subplots(1, 2, figsize=(8, 4))

# Average of the three period grades. A row with a missing grade yields NaN here,
# which the groupby mean below skips.
df['grades'] = (df['G1'] + df['G2'] + df['G3']) / 3

axes[0].plot(df.groupby('Feature_2')['grades'].mean(), marker='o', color='orange')
axes[0].set_ylim(8, 16)
axes[0].set_xticks([1, 2, 3, 4])
axes[0].set_xlabel('Feature_2 Values')
axes[0].set_ylabel('Average Grades')

axes[1].plot(df.groupby('Feature_2')['failures'].mean(), marker='o', color='orange')
axes[1].set_xticks([1, 2, 3, 4])
axes[1].set_ylim(0, 1)
axes[1].set_xlabel('Feature_2 Values')
axes[1].set_ylabel('Average Failures')

# Apply the layout BEFORE saving so the exported image matches what is displayed.
plt.tight_layout()
plt.savefig('Relation of Feature_2 with Grades and Failures')
plt.show()

- 🤔 The figure shows that as the Feature_2 value increases, the average grade of the students increases and the average number of failures decreases swiftly, which suggests that Feature_2 could be study time in hours (the values are consistent with that) or GPA out of 4.
- If Feature_2 were GPA, grades should increase linearly with it, but at value 3 the average grade decreases. So we can conclude that Feature_2 is indeed study time in hours, as grades tend to stagnate after a certain amount of study time.

In [None]:
# Relate Feature_3 to mean grade, mean failures and mean absences per level.
df['grades'] = (df['G1'] + df['G2'] + df['G3']) / 3

# (column to average, y-axis label, fixed y-limits or None to auto-scale)
panels = [
    ('grades', 'Average Grades', (8, 16)),
    ('failures', 'Average Failures', (0, 1)),
    ('absences', 'Average Absences', None),
]

fig, axes = plt.subplots(1, len(panels), figsize=(12, 4))
for ax, (column, ylabel, ylim) in zip(axes, panels):
    ax.plot(df.groupby('Feature_3')[column].mean(), marker='o', color='indigo')
    ax.set_xticks([1, 2, 3, 4, 5])
    if ylim is not None:
        ax.set_ylim(*ylim)
    ax.set_xlabel('Feature_3 Values')
    ax.set_ylabel(ylabel)

plt.tight_layout()
plt.savefig('Relation of Feature_3 with Grades, Failures and Absences')
plt.show()

- 🤔 Since Feature_3 is negatively correlated with grades and positively correlated with average failures and absences, it can be associated with increasing stress levels.

## Level 2: Data Integrity Audit 

In [None]:
# For every text column: print its category frequencies and how many rows are missing.
n_rows = df.shape[0]
for col in df.select_dtypes(include='object'):
    frequencies = df[col].value_counts()  # NaN is excluded from the counts
    print(frequencies.to_string())
    print(f"{col}: Missing = {n_rows - frequencies.sum()}\n")

In [None]:
# Impute missing `famsize` values with the most frequent category.
most_common_famsize = df['famsize'].mode()[0]
df['famsize'] = df['famsize'].fillna(most_common_famsize)

# Re-check: value_counts() skips NaN, so total rows minus counted rows = values still missing.
col = 'famsize'
still_missing = df.shape[0] - df[col].value_counts().sum()
print(f"{col}: Missing = {still_missing}\n")

In [None]:
# Impute missing `higher` values by sampling from the observed distribution of `higher`
# within each `romantic` group, so the higher/romantic relationship is preserved.
#
# normalize='index' makes every row (one per romantic status) sum to 1, i.e. the table
# holds P(higher | romantic), which is the distribution needed to draw `higher` for a
# row whose romantic status is known. Rows with a missing value are not counted.
higher_given_romantic = pd.crosstab(df['romantic'], df['higher'], normalize='index')
print((higher_given_romantic * 100).round(2))

for status, probs in higher_given_romantic.iterrows():
    missing = df['higher'].isna() & (df['romantic'] == status)
    if missing.any():
        # Probabilities come from the data instead of being typed in by hand.
        df.loc[missing, 'higher'] = np.random.choice(
            probs.index.to_numpy(),
            size=int(missing.sum()),
            p=probs.to_numpy(),
        )

print(df['higher'].value_counts())

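The output shows that the categorical columns have no null values except for two columns, *famsize* and *higher*. Missing *famsize* values are filled with the column's mode, while missing *higher* values are filled by random sampling according to the proportions of the existing data within each *romantic* group, so that the imputation does not skew the romantic target column.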

In [None]:
# Count missing values in every non-text column.
missing_per_column = df.select_dtypes(exclude='object').isna().sum()
for col, n_missing in missing_per_column.items():
    print("{} : {}".format(col, n_missing))

Some columns have missing values. We will fill the continuous columns with their median/mean, and fill the categorical columns (coded as integers) with their mode, because the mode is always a value that actually occurs in the data.

In [None]:
# Columns selected for imputation in the next cell; inspect their distributions first.
to_fill = ['Fedu','traveltime','freetime','absences','G2','Feature_1','Feature_2','Feature_3']
df[to_fill].describe()

In [None]:
from sklearn.impute import SimpleImputer

# Integer-coded categorical columns -> most frequent value.
cat_fill = ['Fedu', 'traveltime', 'freetime', 'Feature_2', 'Feature_3']
# Count/age-like columns -> median.
median_cols = ['absences', 'Feature_1']

# NOTE(review): the imputers are fitted on the full dataset, before the train/test split.
df[median_cols] = SimpleImputer(strategy='median').fit_transform(df[median_cols])
df['G2'] = SimpleImputer(strategy='mean').fit_transform(df[['G2']])
df[cat_fill] = SimpleImputer(strategy='most_frequent').fit_transform(df[cat_fill])

# Confirm that no non-text column has missing values left.
remaining_nulls = df.select_dtypes(exclude='object').isna().sum()
for col, n_missing in remaining_nulls.items():
    print("{} : {}".format(col, n_missing))

In [None]:
# Recompute the average grade now that G2 has been imputed. Using the mean of the three
# periods keeps `grades` on the same scale as the definition used in the Level 1 cells
# and matches the "Mean Student Grade" axis labels of the plots below.
df['grades'] = (df['G1'] + df['G2'] + df['G3']) / 3
# Number of rows where the grade is still missing.
print(df['grades'].isna().sum())

## Level 3: Exploratory Insight Report 

### Q1) Does education of parents affect the grades of the student?

In [None]:
# Average the mother's and father's education level, then plot the mean grade per level.
df['Parental_Edu'] = (df['Medu'] + df['Fedu']) / 2
grade_by_parent_edu = df.groupby('Parental_Edu')['grades'].mean().reset_index()

fig, ax = plt.subplots(figsize=(6, 4))
sns.lineplot(data=grade_by_parent_edu, x='Parental_Edu', y='grades',
             marker='o', color='green', ax=ax)

ax.set_title("Student Grades vs Average Parental Education")
ax.set_xlabel("Average Parental Education Level (0 = none, 4 = higher ed)")
ax.set_ylabel("Mean Student Grade")
ax.grid(True)

fig.tight_layout()
fig.savefig("Student Grades vs Average Parental Education")
plt.show()

This plot clearly shows that higher average parental education is associated with better student grades. This trend is likely due to several factors like: better academic support at home, higher expectations, and access to resources.
The strongest gains appear after an average parental education of 2.5, pointing to the fact that educational performance increases substantially when both parents are well-educated.

### Q2) How does relationship status and sex affect the grades of students? 

In [None]:
# Mean grade per sex, split by relationship status.
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(data=df, x='sex', y='grades', hue='romantic', edgecolor='black', ax=ax)

ax.set_title('Average Grades by Gender and Relationship Status')
ax.set_xlabel('Gender (F = Female, M = Male)')
ax.set_ylabel('Mean Student Grade')
ax.legend(title='In Relationship')

fig.tight_layout()
fig.savefig('Average Grades by Gender and Relationship Status')
plt.show()

- The chart highlights a consistent pattern: students who are not in a romantic relationship tend to perform better academically. A possible reason is that more time is spent maintaining the relationship, leaving less time for academic activities.
- Female students outperform male students regardless of their relationship status.
- The negative impact of a relationship on grades affects both genders nearly equally.

### Q3) How are *famrel*, *Dalc*, *goout*, *health*, *Feature_3(stress level)* associated with each other?

In [None]:
# Pairwise correlations between the lifestyle-related columns.
lifestyle_cols = ['famrel', 'Dalc', 'goout', 'health', 'Feature_3']
corr = df[lifestyle_cols].corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr, annot=True, cmap='magma', ax=ax)
ax.set_title("Lifestyle Factors heatmap")

fig.tight_layout()
fig.savefig("Lifestyle Factors heatmap")
plt.show()

- Dalc vs Feature_3 (stress level) -> 0.57 <br>This is a strong relationship. Students who drink more alcohol on weekdays also tend to report a high amount of stress.
- Dalc vs goout -> 0.25
<br>Moderate correlation. Students who socialize more often tend to drink more during the week — this is expected as part of a *party culture*.
- goout vs Feature_3 -> 0.37 <br>Socially active students are also more likely to report high stress levels, reinforcing the idea of *peer pressure and a mindset of fitting into society*.

### Q4) How does Parental cohabitation status affect student's Alcohol consumption and Family relationship quality

In [None]:
# Compare weekday alcohol consumption and family-relationship quality by parental cohabitation status.
# (column to average, y-axis label)
pstatus_panels = [
    ('Dalc', 'Alcohol consumption'),
    ('famrel', 'Family Relationship Quality'),
]

fig, axes = plt.subplots(1, len(pstatus_panels), figsize=(8, 4))
for ax, (column, ylabel) in zip(axes, pstatus_panels):
    sns.barplot(data=df, x='Pstatus', y=column, ax=ax, edgecolor='black', color='skyblue')
    ax.set_xlabel('Pstatus(T : Together, A : Apart)')
    ax.set_ylabel(ylabel)

plt.tight_layout()
plt.savefig('Pstatus vs Dalc|Famrel')
plt.show()

- The alcohol consumption of students whose parents live together is greater than that of students whose parents live apart. A possible reason is that students living with a single parent are more protected and often have less freedom.
- Students whose parents are together report slightly higher family relationship quality than those whose parents are apart. Students with both parents at home may have greater emotional support and family bonding.

### Q5) Relation between Age and Relationship Status

In [None]:
# How many students there are at each Feature_1 value.
df['Feature_1'].value_counts()

In [None]:
# Numeric copy of the target so barplot can average it: the mean of a 0/1 column is the
# share of students in a relationship.
df['romantic_1'] = df['romantic'].map({'yes': 1, 'no': 0})

fig, ax = plt.subplots()
# Only Feature_1 values up to 19 are plotted.
sns.barplot(x='Feature_1', y='romantic_1', data=df[df['Feature_1'] <= 19], ax=ax)
ax.set_ylabel('Share of Students in a Relationship')
ax.set_xlabel('Age')
ax.set_title('Relationship Status vs Age')

fig.tight_layout()
fig.savefig('Relationship Status vs Age')
plt.show()

The bar chart indicates a strong positive relationship between age and the likelihood of being in a romantic relationship among students aged 15 to 19. The trend suggests that emotional and social maturity, which grow with age, play a key role in relationship formation. Older students also tend to have greater freedom.

## Level 4: Relationship Prediction Model 

### Data Preparation

In [None]:
# Remove the helper columns that were created only for the exploratory plots.
# errors='ignore' keeps the cell re-runnable: on a second run the columns are already gone.
df = df.drop(columns=['Parental_Edu', 'romantic_1'], errors='ignore')
df.info()

In [None]:
# Replace every text column with integer codes, then choose the model inputs.
# NOTE: this cell overwrites the text columns in place, so running it twice turns them into NaN.
yes_no = {'yes': 1, 'no': 0}
job_codes = {'teacher': 4, 'health': 3, 'services': 2, 'at_home': 1, 'other': 0}

encodings = {
    'schoolsup': yes_no,
    'famsup': yes_no,
    'paid': yes_no,
    'activities': yes_no,
    'nursery': yes_no,
    'higher': yes_no,
    'internet': yes_no,
    'famsize': {'LE3': 0, 'GT3': 1},
    'school': {'GP': 1, 'MP': 0},
    'sex': {'M': 1, 'F': 0},
    'address': {'U': 1, 'R': 0},
    'Pstatus': {'T': 1, 'A': 0},
    'guardian': {'father': 2, 'mother': 1, 'other': 0},
    'Mjob': job_codes,
    'Fjob': job_codes,
    'reason': {'home': 3, 'reputation': 2, 'course': 1, 'other': 0},
    'romantic': yes_no,
}
for column, codes in encodings.items():
    df[column] = df[column].map(codes)

# The target plus the predictors that are left out of the model.
excluded_columns = [
    'romantic', 'school', 'famsize', 'Fedu', 'Medu', 'guardian', 'Mjob', 'Fjob',
    'nursery', 'higher', 'address', 'freetime', 'health', 'famsup', 'reason',
    'Feature_2', 'absences', 'activities', 'paid', 'schoolsup',
]
X = df.drop(columns=excluded_columns)
y = df['romantic']

print("X columns:", X.columns.tolist())

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 70/30 split, stratified on the target so both parts keep the same class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y
)

# Learn the scaling statistics from the training part only, then apply them to both parts.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Logistic regression on the standardised features.
lr_model = LogisticRegression(random_state=0, C=0.1).fit(X_train_scaled, y_train)
y_pred_lr = lr_model.predict(X_test_scaled)

print(classification_report(y_test, y_pred_lr))

# Confusion matrix on the held-out test split.
lr_cm = confusion_matrix(y_test, y_pred_lr)
ConfusionMatrixDisplay(confusion_matrix=lr_cm, display_labels=['no', 'yes']).plot(cmap='Blues')
plt.title('Confusion Matrix')
plt.show()

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the unscaled features.
rf_model = RandomForestClassifier(random_state=0, n_estimators=200).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

print(classification_report(y_test, y_pred_rf))

# Confusion matrix on the held-out test split.
rf_cm = confusion_matrix(y_test, y_pred_rf)
ConfusionMatrixDisplay(confusion_matrix=rf_cm, display_labels=['no', 'yes']).plot(cmap='Blues')
plt.title('Confusion Matrix')
plt.show()

### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest neighbours (k = 15) on the standardised features.
knn_model = KNeighborsClassifier(n_neighbors=15).fit(X_train_scaled, y_train)
y_pred_knn = knn_model.predict(X_test_scaled)

print(classification_report(y_test, y_pred_knn))

# Confusion matrix on the held-out test split.
knn_cm = confusion_matrix(y_test, y_pred_knn)
ConfusionMatrixDisplay(confusion_matrix=knn_cm, display_labels=['no', 'yes']).plot(cmap='Blues')
plt.title('Confusion Matrix')
plt.show()

### XGBoost

In [None]:
import xgboost as xgb

# Gradient-boosted trees with a binary logistic objective, on the unscaled features.
xgb_model = xgb.XGBClassifier(random_state=0, objective='binary:logistic')
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

print(classification_report(y_test, y_pred_xgb))

# Confusion matrix on the held-out test split.
xgb_cm = confusion_matrix(y_test, y_pred_xgb)
ConfusionMatrixDisplay(confusion_matrix=xgb_cm, display_labels=['no', 'yes']).plot(cmap='Blues')
plt.title('XGBoost Confusion Matrix')
plt.show()

## Level 5: Model Reasoning & Interpretation 

In [None]:
# Decision boundary of a two-feature logistic regression on `goout` and `Feature_3`.
# Approach adapted from:
# https://stackoverflow.com/questions/28256058/plotting-decision-boundary-of-logistic-regression

# Use dedicated names so the X / y used for the train/test split are not overwritten;
# otherwise re-running the split cell after this one would silently use two features only.
boundary_features = ['goout', 'Feature_3']
X_2d = df[boundary_features].copy()
y_2d = df['romantic']

scaler_2 = StandardScaler()
X_2d_scaled = scaler_2.fit_transform(X_2d)

lr_model_2 = LogisticRegression(class_weight='balanced')
lr_model_2.fit(X_2d_scaled, y_2d)

# Evaluate the model on every combination of the observed ordinal levels.
x_vals = sorted(X_2d['goout'].unique())
y_vals = sorted(X_2d['Feature_3'].unique())
xx, yy = np.meshgrid(x_vals, y_vals)
# Give the grid the same column names the scaler was fitted with.
grid = pd.DataFrame(np.c_[xx.ravel(), yy.ravel()], columns=boundary_features)
grid_scaled = scaler_2.transform(grid)
Z = lr_model_2.predict(grid_scaled).reshape(xx.shape)

plt.figure(figsize=(8, 6))
# Shaded regions show the predicted class; dots show the observed class of each student.
plt.contourf(xx, yy, Z, alpha=0.3, cmap='coolwarm')
plt.scatter(X_2d['goout'], X_2d['Feature_3'], c=y_2d, cmap='coolwarm', edgecolor='k')

plt.xlabel('goout')
plt.ylabel('Stress Levels')
plt.title('Decision Boundary of Logistic Regression (2 Ordinal Features)')
plt.xticks(x_vals)
plt.yticks(y_vals)
plt.grid(True)
plt.colorbar(label='Predicted Class')
plt.show()

Higher stress levels are associated with not being in a relationship, while a high `goout` value is associated with being in a romantic relationship.

In [None]:
import shap

# lr_model was fitted on the standardised features, so it must be explained in that same
# space. Feeding it the unscaled X_train / X_test would attribute outputs the model never
# produces for real predictions.
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

explainer = shap.Explainer(lr_model, X_train_scaled_df)
shap_values = explainer.shap_values(X_test_scaled_df)

# The unscaled X_test is passed only as display data for the point colouring.
shap.summary_plot(shap_values, X_test)

In [None]:
# Positions (within the test split) of the first predicted "yes" and the first predicted "no".
yes_idx = np.where(y_pred_lr == 1)[0][0]
no_idx = np.where(y_pred_lr == 0)[0][0]

# lr_model was fitted on standardised features, so the local explanations are computed in
# that space, using an explainer whose background data is the standardised training set.
# The waterfall plots therefore show standardised feature values.
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
local_explainer = shap.Explainer(lr_model, X_train_scaled_df)

yes_sample = X_test_scaled_df.iloc[[yes_idx]]
no_sample = X_test_scaled_df.iloc[[no_idx]]

shap_values_yes = local_explainer(yes_sample)
shap_values_no = local_explainer(no_sample)

shap.plots.waterfall(shap_values_yes[0])
shap.plots.waterfall(shap_values_no[0])


### SHAP Value Interpretation Report

#### Model: Logistic Regression
Analyze the predictions made by Logistic Regression model using **SHAP**, to understand how each feature contributes to the model's decisions.

---

#### SHAP Beeswarm Plot — Global Feature Importance

| Rank | Feature         | Interpretation |
|------|------------------|----------------|
| 1    | `G2`             | More G2 -> NO |
| 2    | `G3`             | Higher G3 -> NO |
| 3    | `Feature_1`      | Higher Age -> YES |
| 4    | `sex`            | male -> NO |
| 5    | `G1`       | Higher G1 -> YES |

---

#### SHAP Waterfall Plot — Local Explanation (YES)

`f(x) = 3.502` -> **Yes**

| Feature             | Value        | SHAP Contribution |
|---------------------|--------------|-------------------|
| `G3`                | 15           | +1.27             |
| `G2`                | 14           | +1.15             |
| `Feature_1`         | 18           | +0.30             |
| `traveltime`        | 2            | +0.04             |
| `famrel`            | 5            | +0.04             |
| `internet`          | 1            | +0.04             |
| `Dalc`              | 1            | -0.04             |
| `G1`                | 8            | -0.33             |

**Interpretation:**  
The student is predicted **Yes** largely because of the positive contributions of the G3 and G2 grades and of age (18); the low G1 grade pulls slightly in the opposite direction.

---

#### SHAP Waterfall Plot — Local Explanation (NO)

`f(x) = 1.441` -> **No**

| Feature           | Value        | SHAP Contribution |
|-------------------|--------------|-------------------|
| `G1`              | 9            | -0.23             |
| `Feature_1`       | 16           | -0.19             |
| `sex`             | Male         | -0.15             |
| `G2`              | 9            | +0.54             |
| `G3`              | 10           | +0.21             |
| `traveltime`      | 3            | -0.14             |


**Interpretation:**  
This student is predicted **No**, mainly because of G1, age and sex. Although his G3 and G2 have positive contributions, they are outweighed by the negative factors.

---

#### Conclusion

- **Global SHAP** analysis shows that the grades (G2, G3, G1), age (Feature_1) and sex drive most predictions, as listed in the global importance table above, which is not surprising.
- **Local SHAP** helps to understand why student was predicted "Yes" or "No".

## Bonus Level: The Mystery Boundary Match 

### Plot 1
<img src='Plot_1.png' width='600'>

This plot shows the decision boundaries of a **decision tree**, because the splits on individual features create axis-parallel boundaries.

---
### Plot 2
<img src='Plot_2.png' width='600'>

This plot has the decision boundary of **random forest** because it is similar to Plot 1 but more jagged than plot 1 indicating a collection of more than one decision tree.

---
### Plot 3
<img src='Plot_3.png' width='600'>

This plot can be attributed to **logistic regression with kernels/SVM** due to its shape and smoothness

---
### Plot 4
<img src='Plot_4.png' width='600'>

This plot is associated with **SVM** because of its smooth decision boundary.

---
### Plot 5
<img src='Plot_5.png' width='600'>

The final plot is basically the decision boundary of **KNN** algorithm with low k value(typically 2 or 3) as the curve is highly overfitted and jagged 