In [None]:
!pip install opendatasets
import opendatasets as od
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

dataset_link="https://www.kaggle.com/competitions/udea-ai4eng-20242"
od.download(dataset_link)
#{"username":"gonzandres","key":"fa2731bdd837290a987f98515ff47e13"}
df =pd.read_csv("/content/udea-ai4eng-20242/train.csv")

### Preprocesing Pipeline


In [2]:
df["RENDIMIENTO_GLOBAL"] = pd.Categorical(
    df["RENDIMIENTO_GLOBAL"],
    categories=["bajo", "medio-bajo", "medio-alto", "alto"],
    ordered=True,
).codes

#### Variables


In [3]:
order_ESTU_PRGM_ACADEMICO_count = ['Muy raro', 'Raro', 'Poco común', 'Moderadamente popular', 'Popular', 'Muy Popular']

order_PERIODO = [
    20183,
    20184,
    20194,
    20195,
    20196,
    20202,
    20203,
    20212,
    20213,
]

order_ESTU_VALORMATRICULAUNIVERSIDAD = [
    "Menos de 500 mil",
    "Entre 500 mil y menos de 1 millón",
    "Entre 1 millón y menos de 2.5 millones",
    "Entre 2.5 millones y menos de 4 millones",
    "Entre 4 millones y menos de 5.5 millones",
    "Entre 5.5 millones y menos de 7 millones",
    "Más de 7 millones",
    "No pagó matrícula",
]

order_ESTU_HORASSEMANATRABAJA = [
    "0",
    "Menos de 10 horas",
    "Entre 11 y 20 horas",
    "Entre 21 y 30 horas",
    "Más de 30 horas",
]

order_FAMI_ESTRATOVIVIENDA = [
    "Estrato 1",
    "Estrato 2",
    "Estrato 3",
    "Estrato 4",
    "Estrato 5",
    "Estrato 6",
    "Sin Estrato",
]

order_FAMI_EDUCACION_PADRES = [
    'Primaria incompleta',
    'Primaria completa',
    'Secundaria (Bachillerato) incompleta',
    'Secundaria (Bachillerato) completa',
    'Técnica o tecnológica incompleta',
    'Técnica o tecnológica completa',
    'Educación profesional incompleta',
    'Educación profesional completa',
    'Postgrado',
    'Ninguno',
]

order_RENDIMIENTO_GLOBAL = ["bajo", "medio-bajo", "medio-alto", "alto"]

ordinal_variables = [
    "ESTU_VALORMATRICULAUNIVERSIDAD",
    "ESTU_HORASSEMANATRABAJA",
    "FAMI_ESTRATOVIVIENDA",
    "FAMI_EDUCACIONPADRE",
    "FAMI_EDUCACIONMADRE",
]

nominal_variables = [
        "ESTU_PRGM_DEPARTAMENTO",
        "FAMI_TIENEINTERNET",
        "ESTU_PAGOMATRICULAPROPIO",
    ]

ordinal_categories = [
    order_ESTU_VALORMATRICULAUNIVERSIDAD,
    order_ESTU_HORASSEMANATRABAJA,
    order_FAMI_ESTRATOVIVIENDA,
    order_FAMI_EDUCACION_PADRES,
    order_FAMI_EDUCACION_PADRES,
]

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.impute import KNNImputer, SimpleImputer
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

# Custom transformer using SentenceTransformer and KMeans clustering
class ProgramClusterer(BaseEstimator, TransformerMixin):
    def __init__(self, num_clusters=15, model_name='distiluse-base-multilingual-cased-v2'):
        self.num_clusters = num_clusters
        self.model_name = model_name

    def fit(self, X, y=None):
        # Extract unique programs
        self.programs = X['ESTU_PRGM_ACADEMICO'].unique()

        # Load the SentenceTransformer model
        self.model = SentenceTransformer(self.model_name)

        # Generate embeddings for the unique programs
        self.embeddings = self.model.encode(self.programs)

        # Perform KMeans clustering
        self.kmeans = KMeans(n_clusters=self.num_clusters, random_state=42)
        self.labels = self.kmeans.fit_predict(self.embeddings)

        # Map programs to cluster labels
        self.program_to_cluster = {program: label + 1 for program, label in zip(self.programs, self.labels)}
        return self

    def transform(self, X):
        # Map programs to their clusters
        X['ESTU_PRGM_ACADEMICO'] = X['ESTU_PRGM_ACADEMICO'].map(self.program_to_cluster)

        # Handle programs not seen during fit
        X['ESTU_PRGM_ACADEMICO'].fillna(-1, inplace=True)  # Assign -1 to unseen programs
        return X

# Ordinal encoding setup with handling unknown values
ordinal_imputer = SimpleImputer(strategy='constant', fill_value=-1)
ordinal_pipeline = Pipeline([
    ('imputation', ordinal_imputer),
    ('encoding', OrdinalEncoder(categories=ordinal_categories, handle_unknown='use_encoded_value', unknown_value=-1))
])

# One-hot encoding for nominal variables
nominal_preprocessing = OneHotEncoder(drop='first', sparse_output=False)

# Column transformer to apply transformations
encoding_transformer = ColumnTransformer([
    ('ordinal', ordinal_pipeline, ordinal_variables),
    ('nominal', nominal_preprocessing, nominal_variables),
])

# Final pipeline with the custom ProgramClusterer
preprocessing_pipeline = Pipeline([
    ('program_clustering', ProgramClusterer(num_clusters=15)),  # Replace the rare programs logic with clustering
    ('encoding', encoding_transformer)
])

# Apply the pipeline to your DataFrame
df_transformed = preprocessing_pipeline.fit_transform(df)

  from .autonotebook import tqdm as notebook_tqdm
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X['ESTU_PRGM_ACADEMICO'].fillna(-1, inplace=True)  # Assign -1 to unseen programs


In [5]:
df_transformed = pd.DataFrame(df_transformed)

### Model Training


#### Data Splitting

In [6]:
from sklearn.model_selection import train_test_split

y = df["RENDIMIENTO_GLOBAL"]
X = df_transformed

X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.125, random_state=42)
# (0.125 * 0.8 = 0.1 of the original data for validation)

print("Training set:", X_train.shape, y_train.shape)
print("Validation set:", X_val.shape, y_val.shape)
print("Test set:", X_test.shape, y_test.shape)

Training set: (484750, 39) (484750,)
Validation set: (69250, 39) (69250,)
Test set: (138500, 39) (138500,)


#### Model Selection

In [7]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize the XGBoost Classifier
xgb_classifier = XGBClassifier(
    n_estimators=100,          # Number of trees
    learning_rate=0.1,         # Learning rate
    max_depth=5,               # Maximum depth of trees
    objective='multi:softmax', # Multi-class classification
    random_state=42
)

# Train the model
xgb_classifier.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

# Validate the model
y_val_pred = xgb_classifier.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {val_accuracy:.5f}")

# Detailed performance
print("\nClassification Report on Validation Set:")
print(classification_report(y_val, y_val_pred))

# Evaluate on the test set
y_test_pred = xgb_classifier.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"\nTest Accuracy: {test_accuracy:.5f}")

print("\nClassification Report on Test Set:")
print(classification_report(y_test, y_test_pred))

Validation Accuracy: 0.40401

Classification Report on Validation Set:
              precision    recall  f1-score   support

           0       0.41      0.54      0.47     17231
           1       0.31      0.26      0.28     17107
           2       0.32      0.21      0.25     17137
           3       0.50      0.60      0.54     17775

    accuracy                           0.40     69250
   macro avg       0.39      0.40      0.39     69250
weighted avg       0.39      0.40      0.39     69250


Test Accuracy: 0.40456

Classification Report on Test Set:
              precision    recall  f1-score   support

           0       0.42      0.54      0.47     34573
           1       0.31      0.26      0.28     34503
           2       0.31      0.21      0.25     34259
           3       0.50      0.61      0.55     35165

    accuracy                           0.40    138500
   macro avg       0.39      0.40      0.39    138500
weighted avg       0.39      0.40      0.39    138500


In [8]:
from sklearn.model_selection import RandomizedSearchCV

# Define the parameter grid
param_distributions = {
    'n_estimators': [100, 300, 500],  # Number of trees
    'max_depth': [3, 7, 12],              # Maximum tree depth
    'learning_rate': [0.01, 0.2],    # Learning rate
    # 'subsample': [0.6, 1.0],               # Fraction of samples used for training each tree
    # 'colsample_bytree': [0.6, 1.0],        # Fraction of features used for each tree
    # 'gamma': [0, 1, 5],                         # Minimum loss reduction required for a split
    # 'min_child_weight': [1, 3, 5],              # Minimum sum of instance weights in a child node
}

# Initialize the RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=xgb_classifier,
    param_distributions=param_distributions,
    n_iter=8,                  # Number of parameter settings sampled
    cv=2,                       # Cross-validation splits
    scoring='accuracy',         # Evaluation metric
    verbose=2,
    random_state=42,
    n_jobs=6                 # Use all available cores
)

# Fit the random search to the training data
random_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best parameters found: ", random_search.best_params_)
print("Best cross-validation score: ", random_search.best_score_)


Fitting 2 folds for each of 8 candidates, totalling 16 fits
Best parameters found:  {'n_estimators': 300, 'max_depth': 7, 'learning_rate': 0.2}
Best cross-validation score:  0.4033893759669933


### Kaggle Submission


In [9]:
import pandas as pd

# Load test data
test_data = pd.read_csv("udea-ai4eng-20242/test.csv")

In [10]:
# Apply the pipeline to your DataFrame
df_transformed_test = preprocessing_pipeline.transform(test_data)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X['ESTU_PRGM_ACADEMICO'].fillna(-1, inplace=True)  # Assign -1 to unseen programs


In [11]:
# Convert the NumPy array to a DataFrame
df_transformed_test = pd.DataFrame(df_transformed_test)

In [12]:
# Make predictions
predictions = random_search.predict(df_transformed_test)
predictions = predictions.flatten()

In [13]:
# Define the mapping from encoded values to categorical labels
category_labels = ["bajo", "medio-bajo", "medio-alto", "alto"]

# Start with the ID column
submission_df = test_data[["ID"]].copy()

# Map predictions (numeric) back to categorical labels
submission_df["RENDIMIENTO_GLOBAL"] = [
    category_labels[pred] for pred in predictions
]

# Save the submission file
submission_df.to_csv("submission.csv", index=False)

print("Submission file created:")
print(submission_df.head())

Submission file created:
       ID RENDIMIENTO_GLOBAL
0  550236               bajo
1   98545         medio-alto
2  499179               alto
3  782980               bajo
4  785185               bajo


In [14]:
# Save to CSV for submission
submission_df.to_csv("submissionCB.csv", index=False)