In [None]:
!pip install opendatasets
import opendatasets as od
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

dataset_link = "https://www.kaggle.com/competitions/udea-ai4eng-20242"

# Kaggle credentials must never be written into the notebook. opendatasets
# looks for a `kaggle.json` file in the working directory and otherwise
# prompts for the username and API key interactively.
# NOTE(review): an API key was previously pasted here in a comment; treat it
# as leaked and revoke it from the Kaggle account settings.
od.download(dataset_link)

# od.download() places the files in ./udea-ai4eng-20242 (the same folder the
# submission section reads test.csv from); a relative path keeps the notebook
# runnable outside Colab's /content directory.
df = pd.read_csv("udea-ai4eng-20242/train.csv")

### Model Training


#### Data Splitting

In [None]:
# Names of the columns handled as categorical inputs, one per line.
# The order is significant: it is the order in which the preprocessing step
# hands the columns to the model.
categorical_features = """
    ESTU_VALORMATRICULAUNIVERSIDAD
    ESTU_HORASSEMANATRABAJA
    FAMI_ESTRATOVIVIENDA
    FAMI_EDUCACIONPADRE
    FAMI_EDUCACIONMADRE
    ESTU_PRGM_DEPARTAMENTO
    FAMI_TIENEINTERNET
    ESTU_PAGOMATRICULAPROPIO
    ESTU_PRGM_ACADEMICO
""".split()

In [3]:
from sklearn.model_selection import train_test_split

# Separate the label from the predictors; ID and PERIODO are left out of the
# feature matrix along with the label itself.
X = df.drop(["RENDIMIENTO_GLOBAL", "ID", "PERIODO"], axis=1)
y = df.loc[:, "RENDIMIENTO_GLOBAL"]

# Two-stage hold-out split with a fixed seed:
#   1) 80% train / 20% held out,
#   2) the held-out 20% goes 87.5% to validation and 12.5% to test,
#      i.e. roughly 17.5% validation and 2.5% test of the full data.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, random_state=42, test_size=0.2
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, random_state=42, test_size=0.125
)


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, classification_report

# Positions of the categorical columns *as CatBoost will see them*.
# The ColumnTransformer below keeps only `categorical_features` (its default
# remainder="drop") and emits them in list order, so the model's input columns
# are 0..n-1. Looking the positions up in X instead would point at the wrong
# (or non-existent) columns whenever X's layout differs from that list.
categorical_indices = list(range(len(categorical_features)))

# Preprocessing: select the categorical columns by name and replace missing
# values with the constant -1 so that they form their own category.
categorical_preprocessor = ColumnTransformer(
    transformers=[
        (
            "cat",
            SimpleImputer(strategy="constant", fill_value=-1),
            categorical_features,
        ),
    ]
)

# Full pipeline: imputation followed by a multi-class CatBoost model.
pipeline = Pipeline(
    steps=[
        ("preprocessor", categorical_preprocessor),  # Apply preprocessing
        (
            "model",
            CatBoostClassifier(  # Train CatBoost
                iterations=2000,
                learning_rate=0.1,
                depth=6,
                random_state=42,
                verbose=100,  # log the training loss every 100 iterations
                loss_function="MultiClass",
                cat_features=categorical_indices,
            ),
        ),
    ]
)

# Fit on the training split only; validation and test stay unseen.
pipeline.fit(X_train, y_train)

# Evaluate on validation set
y_val_pred = pipeline.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {val_accuracy:.5f}")
print("\nClassification Report on Validation Set:")
print(classification_report(y_val, y_val_pred))

# Evaluate on test set
y_test_pred = pipeline.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"\nTest Accuracy: {test_accuracy:.5f}")
print("\nClassification Report on Test Set:")
print(classification_report(y_test, y_test_pred))

0:	learn: 1.3396417	total: 1.47s	remaining: 48m 59s
100:	learn: 1.1888249	total: 2m 11s	remaining: 41m 20s
200:	learn: 1.1846538	total: 4m 25s	remaining: 39m 31s
300:	learn: 1.1813760	total: 6m 38s	remaining: 37m 31s
400:	learn: 1.1782750	total: 8m 52s	remaining: 35m 23s
500:	learn: 1.1751493	total: 11m 3s	remaining: 33m 5s
600:	learn: 1.1722698	total: 13m 15s	remaining: 30m 52s
700:	learn: 1.1693741	total: 15m 25s	remaining: 28m 35s
800:	learn: 1.1663037	total: 17m 38s	remaining: 26m 24s
900:	learn: 1.1632171	total: 19m 52s	remaining: 24m 14s
1000:	learn: 1.1603015	total: 22m 4s	remaining: 22m 1s
1100:	learn: 1.1576544	total: 24m 17s	remaining: 19m 49s
1200:	learn: 1.1548847	total: 26m 32s	remaining: 17m 39s
1300:	learn: 1.1518268	total: 28m 47s	remaining: 15m 27s
1400:	learn: 1.1489339	total: 31m 1s	remaining: 13m 15s
1500:	learn: 1.1460924	total: 33m 15s	remaining: 11m 3s
1600:	learn: 1.1432758	total: 35m 31s	remaining: 8m 51s
1700:	learn: 1.1404274	total: 37m 45s	remaining: 6m 38s


In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters. The "model__" prefix routes each entry to the
# pipeline step named "model"; 1 x 3 x 3 values give 9 combinations.
param_grid = dict(
    model__iterations=[1500],                # number of trees
    model__depth=[5, 6, 7],                  # maximum tree depth
    model__learning_rate=[0.08, 0.1, 0.12],  # learning rates
)

# Exhaustive grid search (every combination is tried, nothing is sampled),
# scored by accuracy with 2-fold cross-validation and 6 parallel jobs.
# By default the search refits the winning configuration, which is then
# available as grid_search.best_estimator_.
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='accuracy',
    n_jobs=6,
    cv=2,
    verbose=2,
)

# Run the search on the training split.
grid_search.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated accuracy.
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)


Fitting 2 folds for each of 9 candidates, totalling 18 fits
0:	learn: 1.3663316	total: 1.56s	remaining: 38m 56s
100:	learn: 1.1964594	total: 2m 18s	remaining: 31m 58s
200:	learn: 1.1904257	total: 4m 36s	remaining: 29m 46s
300:	learn: 1.1875033	total: 6m 54s	remaining: 27m 30s
400:	learn: 1.1856595	total: 9m 10s	remaining: 25m 9s
500:	learn: 1.1840923	total: 11m 25s	remaining: 22m 47s
600:	learn: 1.1827238	total: 13m 43s	remaining: 20m 32s
700:	learn: 1.1813862	total: 16m 2s	remaining: 18m 16s
800:	learn: 1.1801480	total: 18m 17s	remaining: 15m 58s
900:	learn: 1.1788455	total: 20m 35s	remaining: 13m 41s
1000:	learn: 1.1774504	total: 22m 49s	remaining: 11m 22s
1100:	learn: 1.1761288	total: 25m 4s	remaining: 9m 5s
1200:	learn: 1.1747660	total: 27m 22s	remaining: 6m 48s
1300:	learn: 1.1734715	total: 29m 37s	remaining: 4m 31s
1400:	learn: 1.1722924	total: 31m 51s	remaining: 2m 15s
1499:	learn: 1.1709445	total: 34m 5s	remaining: 0us
Best parameters found:  {'model__depth': 6, 'model__iterati

### Kaggle Submission


In [21]:
import pandas as pd                                                                                                 


# Read the competition's test file from the download folder.
test_data = pd.read_csv(filepath_or_buffer="udea-ai4eng-20242/test.csv")

In [22]:
# Predict a label for every test row; flatten() collapses the model output
# into a 1-D array so it can be used as a single column.
predictions = pipeline.predict(test_data).flatten()

In [23]:
# Two-column submission frame: the test IDs next to their predicted label.
submission_df = test_data[["ID"]].assign(RENDIMIENTO_GLOBAL=predictions)

# Write the file without the index column, then preview the first rows.
submission_df.to_csv("submission.csv", index=False)
print("Submission file created:", submission_df.head(), sep="\n")

Submission file created:
       ID RENDIMIENTO_GLOBAL
0  550236               bajo
1   98545         medio-alto
2  499179               alto
3  782980               bajo
4  785185               bajo


In [24]:
# Write the submission frame to a second file, submissionCB.csv, again
# without the index column.
submission_df.to_csv(path_or_buf="submissionCB.csv", index=False)