In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from ucimlrepo import fetch_ucirepo

# Download dataset id 697 from the UCI ML repository through ucimlrepo.
# NOTE(review): presumably this needs network access on every fresh run —
# confirm whether a local cache is wanted so Restart & Run All stays cheap.
predict_students_dropout_and_academic_success = fetch_ucirepo(id=697)

# Unpack the feature table (X) and the target table (y) in a single step.
X, y = (
    predict_students_dropout_and_academic_success.data.features,
    predict_students_dropout_and_academic_success.data.targets,
)

# Quick sanity summary: table dimensions and row counts per target value.
print(f"Data shape: {X.shape}")
print(f"Target distribution: {y.value_counts()}")

# Collect every column whose name contains the substring '1st sem'.
first_sem_features = [
    column_name
    for column_name in X.columns
    if '1st sem' in column_name
]
print("First semester features: ", first_sem_features)

# Columns excluded from modelling: two explicitly named columns plus every
# first-semester column collected earlier.
# NOTE(review): only names containing '1st sem' are removed; columns for any
# other semester remain in X_filtered — confirm that is intended.
features_to_drop = ['Nacionality', "Father's occupation", *first_sem_features]
X_filtered = X.drop(labels=features_to_drop, axis="columns")

# Report the before/after column counts and exactly which columns were removed.
print(f"Original features: {X.shape[1]}")
print(f"Filtered features: {X_filtered.shape[1]}")
print(f"Removed features: {features_to_drop}")

# Hold out 25% of the rows for testing; the split is stratified on the target
# and seeded so it is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_filtered,
    y.values.ravel(),  # flatten the target table into a 1-D label array
    test_size=0.25,
    random_state=42,
    stratify=y,
)

# Fit the scaler on the training partition only, then apply that same fitted
# scaler to both partitions, so the test rows never influence the scaling.
# NOTE(review): every remaining column is scaled, including any that are
# integer-coded categories — confirm that is intended.
scaler = StandardScaler().fit(X_train)
X_trained_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build the logistic-regression classifier (iteration cap of 1000, fixed seed)
# and train it on the scaled training data. fit() returns the estimator itself,
# so construction and training are chained into one statement.
model = LogisticRegression(max_iter=1000, random_state=42).fit(
    X_trained_scaled, y_train
)

# Predict labels for the held-out, scaled test rows.
y_pred = model.predict(X_test_scaled)

# Accuracy = number of correct predictions / total number of predictions.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"\nAccuracy: {accuracy:.4f}")


Data shape: (4424, 36)
Target distribution: Target  
Graduate    2209
Dropout     1421
Enrolled     794
Name: count, dtype: int64
First semester features:  ['Curricular units 1st sem (credited)', 'Curricular units 1st sem (enrolled)', 'Curricular units 1st sem (evaluations)', 'Curricular units 1st sem (approved)', 'Curricular units 1st sem (grade)', 'Curricular units 1st sem (without evaluations)']
Original features: 36
Filtered features: 28
Removed features: ['Nacionality', "Father's occupation", 'Curricular units 1st sem (credited)', 'Curricular units 1st sem (enrolled)', 'Curricular units 1st sem (evaluations)', 'Curricular units 1st sem (approved)', 'Curricular units 1st sem (grade)', 'Curricular units 1st sem (without evaluations)']

Accuracy: 0.7604
