In [None]:
# Install any libraries if need be. Comment out after installing
# !pip install xgboost

# ***Load data***

In [1]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from google.colab import files

uploaded = files.upload()

Saving StudentsPerformance.csv to StudentsPerformance.csv


In [2]:
import pandas as pd
import io

# Extract filename
filename = list(uploaded.keys())[0]

#    Load CSV from either:
#      - a filename (string path)
#      - an uploaded file from Colab's files.upload() dict
#      - a Flask file object (from request.files)

if isinstance(uploaded[filename], str):
  # Assume it's a file path
  df = pd.read_csv(uploaded[filename])
elif hasattr(uploaded[filename], 'read'):
  # Flask's file object or BytesIO
  df = pd.read_csv(io.BytesIO(uploaded[filename].read()))
elif isinstance(uploaded[filename], bytes):
  # Bytes directly (Colab uploaded dict value)
  df = pd.read_csv(io.BytesIO(uploaded[filename]))
else:
  raise ValueError("Unsupported file source type.")


In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from imblearn.over_sampling import SMOTE
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report, accuracy_score

# Features and target
features = ['gender', 'parental level of education', 'lunch',
            'test preparation course', 'math score', 'reading score', 'writing score']
target = 'race/ethnicity'

# Encode categorical features
le_gender = LabelEncoder()
df['gender'] = le_gender.fit_transform(df['gender'])

le_parent = LabelEncoder()
df['parental level of education'] = le_parent.fit_transform(df['parental level of education'])

le_lunch = LabelEncoder()
df['lunch'] = le_lunch.fit_transform(df['lunch'])

le_tpc = LabelEncoder()
df['test preparation course'] = le_tpc.fit_transform(df['test preparation course'])

# Encode target
le_target = LabelEncoder()
y = le_target.fit_transform(df[target])
y_cat = to_categorical(y)  # One-hot encoding for ANN

# Bin numeric scores
df['math score'] = (df['math score'] // 10) * 10
df['reading score'] = (df['reading score'] // 10) * 10
df['writing score'] = (df['writing score'] // 10) * 10

# Scale features
scaler = StandardScaler()
df[['math score', 'reading score', 'writing score']] = scaler.fit_transform(df[['math score', 'reading score', 'writing score']])

# Features for model
X = df[features]

# SMOTE for oversampling
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# One-hot encode target
y_res_cat = to_categorical(y_res)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res_cat, test_size=0.2, random_state=42)

# Compute class weights for original distribution
original_class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y), y=y)
class_weights_dict = dict(enumerate(original_class_weights))

# Build ANN
ann = Sequential()
ann.add(Dense(32, input_dim=X_train.shape[1], activation='relu'))
ann.add(Dense(16, activation='relu'))
ann.add(Dense(y_res_cat.shape[1], activation='softmax'))

# Compile
ann.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train model
history = ann.fit(
    X_train, y_train,
    epochs=100,
    batch_size=16,
    validation_split=0.2,
    class_weight=class_weights_dict)

Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.2394 - loss: 1.9184 - val_accuracy: 0.2500 - val_loss: 1.6619
Epoch 2/100
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.2131 - loss: 1.8296 - val_accuracy: 0.2695 - val_loss: 1.6651
Epoch 3/100
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.2282 - loss: 1.8245 - val_accuracy: 0.2695 - val_loss: 1.6526
Epoch 4/100
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.2472 - loss: 1.7769 - val_accuracy: 0.2617 - val_loss: 1.6374
Epoch 5/100
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.2524 - loss: 1.7639 - val_accuracy: 0.2773 - val_loss: 1.6367
Epoch 6/100
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.2341 - loss: 1.7639 - val_accuracy: 0.2773 - val_loss: 1.6440
Epoch 7/100
[1m64/64[0m [32m━━━━━━━━━━━━━━━

In [4]:
# Evaluate
y_pred_probs = ann.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)
y_true = np.argmax(y_test, axis=1)

print("Accuracy:", accuracy_score(y_true, y_pred))

# Ensure target names are strings
target_names = [str(c) for c in le_target.classes_]

print(classification_report(y_true, y_pred, target_names=target_names))

[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 
Accuracy: 0.3542319749216301
              precision    recall  f1-score   support

     group A       0.34      0.83      0.48        63
     group B       0.33      0.26      0.29        65
     group C       0.67      0.03      0.05        70
     group D       0.30      0.11      0.16        63
     group E       0.40      0.60      0.48        58

    accuracy                           0.35       319
   macro avg       0.41      0.37      0.29       319
weighted avg       0.41      0.35      0.29       319



In [6]:
import pickle

# Save the model to a pickle file
with open('ann.pkl', 'wb') as f:
  pickle.dump(ann, f)

# Save the scaler to a pickle file
with open('scaler.pkl', 'wb') as f:
  pickle.dump(scaler, f)

# Save the feature names to a pickle file
with open('gender.pkl', 'wb') as f:
  pickle.dump(le_gender, f)

with open('parent.pkl', 'wb') as f:
  pickle.dump(le_parent, f)

with open('lunch.pkl', 'wb') as f:
  pickle.dump(le_lunch, f)

with open('tpc.pkl', 'wb') as f:
  pickle.dump(le_tpc, f)

from google.colab import files
files.download("ann.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>