<a href="https://colab.research.google.com/github/ftirado05/Synthetic-Dataset/blob/master/Student_Dropout.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Synthetic Dataset - Student Dropout
#
# Builds a synthetic table of students (demographic, academic and financial
# features plus a binary Dropout target), injects missing values and outliers,
# and saves the result as a CSV file.

import random

import numpy as np
import pandas as pd

# Reproducibility: the features are drawn from NumPy's global RNG (which
# DataFrame.sample also uses when no random_state is passed), while the dropout
# labels are drawn with the standard-library `random` module. Both generators
# have to be seeded, otherwise the target column changes on every run.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# Number of records
n = 600

# Demographic variables
ages = np.random.randint(16, 30, n)  # upper bound is exclusive: ages 16..29
gender = np.random.choice(["Male", "Female", "Other"], n, p=[0.48, 0.48, 0.04])
origin = np.random.choice(["Urban", "Rural"], n, p=[0.7, 0.3])

# Academic variables
hs_avg = np.random.uniform(2.0, 5.0, n).round(2)  # high school average
admission_score = np.random.randint(100, 400, n)  # integer scores 100..399
semester1 = np.random.uniform(0.0, 5.0, n).round(2)  # first semester grades

# Financial variables
socio_level = np.random.choice(["Low", "Medium", "High"], n, p=[0.4, 0.4, 0.2])
scholarship = np.random.choice([0, 1], n, p=[0.3, 0.7])  # 1=Yes (70%), 0=No (30%)
loan = np.random.choice([0, 1], n, p=[0.5, 0.5])  # educational loan, 1=Yes, 0=No

# Target variable (dropout)
# Base dropout probability is 0.3; weak grades (high school or first semester
# below 3.0) add 0.3, and a low socioeconomic level without a scholarship adds 0.2.
low_grades = (hs_avg < 3.0) | (semester1 < 3.0)
low_income_no_scholarship = (socio_level == "Low") & (scholarship == 0)
dropout_prob = 0.3 + 0.3 * low_grades + 0.2 * low_income_no_scholarship
# One Bernoulli draw per student, in row order, from the seeded stdlib RNG.
dropout = [1 if random.random() < p else 0 for p in dropout_prob]

# Create DataFrame
df = pd.DataFrame({
    "Age": ages,
    "Gender": gender,
    "Origin": origin,
    "HS_Average": hs_avg,
    # Stored as float from the start: NaN is assigned into this column below,
    # and relying on an implicit int -> float upcast during assignment is
    # deprecated in recent pandas releases.
    "Admission_Score": admission_score.astype(float),
    "Semester1_Grade": semester1,
    "Socioeconomic_Level": socio_level,
    "Scholarship": scholarship,
    "Loan": loan,
    "Dropout": dropout
})

# Inject missing values: blank out a random 5% of the rows in each column
# (an independent sample of rows is drawn per column).
for col in ["HS_Average", "Semester1_Grade", "Admission_Score"]:
    df.loc[df.sample(frac=0.05).index, col] = np.nan

# Introduce outliers: overwrite a random 1% of the rows with values outside
# the generated ranges (these rows may coincide with rows blanked out above).
df.loc[df.sample(frac=0.01).index, "HS_Average"] = 10.0
df.loc[df.sample(frac=0.01).index, "Admission_Score"] = 999

# Save dataset
df.to_csv("synthetic_dataset_dropout.csv", index=False)

df.head(10)


Unnamed: 0,Age,Gender,Origin,HS_Average,Admission_Score,Semester1_Grade,Socioeconomic_Level,Scholarship,Loan,Dropout
0,22,Male,Rural,3.65,312.0,1.84,High,0,0,1
1,19,Male,Rural,2.13,315.0,0.29,Medium,1,0,1
2,28,Female,Urban,3.9,325.0,0.56,Low,1,1,1
3,26,Female,Urban,4.85,230.0,2.58,Medium,0,1,1
4,23,Female,Urban,3.8,175.0,1.34,Medium,1,0,0
5,28,Female,Urban,4.46,385.0,4.18,Low,1,0,0
6,20,Male,Rural,4.65,376.0,0.07,High,1,1,1
7,22,Male,Urban,,255.0,1.9,High,0,1,1
8,25,Female,Urban,2.64,347.0,1.69,Medium,1,0,0
9,18,Male,Urban,3.83,317.0,0.1,Low,1,1,1
