In [1]:

%pip install ucimlrepo

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, accuracy_score, classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

Note: you may need to restart the kernel to use updated packages.


In [2]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
drug_consumption_quantified = fetch_ucirepo(id=373) 
  
# data (as pandas dataframes) 
X = drug_consumption_quantified.data.features 
y = drug_consumption_quantified.data.targets 
  
# metadata 
print(drug_consumption_quantified.metadata) 
  
# variable information 
print(drug_consumption_quantified.variables) 

{'uci_id': 373, 'name': 'Drug Consumption (Quantified)', 'repository_url': 'https://archive.ics.uci.edu/dataset/373/drug+consumption+quantified', 'data_url': 'https://archive.ics.uci.edu/static/public/373/data.csv', 'abstract': 'Classify type of drug consumer by personality data', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 1885, 'num_features': 12, 'feature_types': ['Real'], 'demographics': ['Age', 'Gender', 'Education Level', 'Nationality', 'Ethnicity'], 'target_col': ['alcohol', 'amphet', 'amyl', 'benzos', 'caff', 'cannabis', 'choc', 'coke', 'crack', 'ecstasy', 'heroin', 'ketamine', 'legalh', 'lsd', 'meth', 'mushrooms', 'nicotine', 'semer', 'vsa'], 'index_col': ['id'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2015, 'last_updated': 'Fri Mar 08 2024', 'dataset_doi': '10.24432/C5TC7S', 'creators': ['Elaine Fehrman', 'Vincent Egan', 'Evgeny Mirkes'], 'intro_paper': {'ID': 413, 

In [3]:
# Obtengo el diccionario de codificación del repositorio

# Cargar dataset
drug = fetch_ucirepo(id=373)

# Diccionario de codificación
diccionario = drug.variables

# Mostrar diccionario completo
pd.set_option('display.max_colwidth', None)
diccionario

Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,id,ID,Integer,,,,no
1,age,Feature,Continuous,Age,,,no
2,gender,Feature,Continuous,Gender,,,no
3,education,Feature,Continuous,Education Level,,,no
4,country,Feature,Continuous,Nationality,,,no
5,ethnicity,Feature,Continuous,Ethnicity,,,no
6,nscore,Feature,Continuous,,,,no
7,escore,Feature,Continuous,,,,no
8,oscore,Feature,Continuous,,,,no
9,ascore,Feature,Continuous,,,,no


In [4]:
# Dataframe completo

df = drug_consumption_quantified.data.original 
df

Unnamed: 0,id,age,gender,education,country,ethnicity,nscore,escore,oscore,ascore,...,ecstasy,heroin,ketamine,legalh,lsd,meth,mushrooms,nicotine,semer,vsa
0,1,0.49788,0.48246,-0.05921,0.96082,0.12600,0.31287,-0.57545,-0.58331,-0.91699,...,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL2,CL0,CL0
1,2,-0.07854,-0.48246,1.98437,0.96082,-0.31685,-0.67825,1.93886,1.43533,0.76096,...,CL4,CL0,CL2,CL0,CL2,CL3,CL0,CL4,CL0,CL0
2,3,0.49788,-0.48246,-0.05921,0.96082,-0.31685,-0.46725,0.80523,-0.84732,-1.62090,...,CL0,CL0,CL0,CL0,CL0,CL0,CL1,CL0,CL0,CL0
3,4,-0.95197,0.48246,1.16365,0.96082,-0.31685,-0.14882,-0.80615,-0.01928,0.59042,...,CL0,CL0,CL2,CL0,CL0,CL0,CL0,CL2,CL0,CL0
4,5,0.49788,0.48246,1.98437,0.96082,-0.31685,0.73545,-1.63340,-0.45174,-0.30172,...,CL1,CL0,CL0,CL1,CL0,CL0,CL2,CL2,CL0,CL0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1880,1884,-0.95197,0.48246,-0.61113,-0.57009,-0.31685,-1.19430,1.74091,1.88511,0.76096,...,CL0,CL0,CL0,CL3,CL3,CL0,CL0,CL0,CL0,CL5
1881,1885,-0.95197,-0.48246,-0.61113,-0.57009,-0.31685,-0.24649,1.74091,0.58331,0.76096,...,CL2,CL0,CL0,CL3,CL5,CL4,CL4,CL5,CL0,CL0
1882,1886,-0.07854,0.48246,0.45468,-0.57009,-0.31685,1.13281,-1.37639,-1.27553,-1.77200,...,CL4,CL0,CL2,CL0,CL2,CL0,CL2,CL6,CL0,CL0
1883,1887,-0.95197,0.48246,-0.61113,-0.57009,-0.31685,0.91093,-1.92173,0.29338,-1.62090,...,CL3,CL0,CL0,CL3,CL3,CL0,CL3,CL4,CL0,CL0


In [5]:
# Dataframe variables features

X.head()

Unnamed: 0,age,gender,education,country,ethnicity,nscore,escore,oscore,ascore,cscore,impuslive,ss
0,0.49788,0.48246,-0.05921,0.96082,0.126,0.31287,-0.57545,-0.58331,-0.91699,-0.00665,-0.21712,-1.18084
1,-0.07854,-0.48246,1.98437,0.96082,-0.31685,-0.67825,1.93886,1.43533,0.76096,-0.14277,-0.71126,-0.21575
2,0.49788,-0.48246,-0.05921,0.96082,-0.31685,-0.46725,0.80523,-0.84732,-1.6209,-1.0145,-1.37983,0.40148
3,-0.95197,0.48246,1.16365,0.96082,-0.31685,-0.14882,-0.80615,-0.01928,0.59042,0.58489,-1.37983,-1.18084
4,0.49788,0.48246,1.98437,0.96082,-0.31685,0.73545,-1.6334,-0.45174,-0.30172,1.30612,-0.21712,-0.21575


In [6]:
# Dataframe variables target

y.head()

Unnamed: 0,alcohol,amphet,amyl,benzos,caff,cannabis,choc,coke,crack,ecstasy,heroin,ketamine,legalh,lsd,meth,mushrooms,nicotine,semer,vsa
0,CL5,CL2,CL0,CL2,CL6,CL0,CL5,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL2,CL0,CL0
1,CL5,CL2,CL2,CL0,CL6,CL4,CL6,CL3,CL0,CL4,CL0,CL2,CL0,CL2,CL3,CL0,CL4,CL0,CL0
2,CL6,CL0,CL0,CL0,CL6,CL3,CL4,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL1,CL0,CL0,CL0
3,CL4,CL0,CL0,CL3,CL5,CL2,CL4,CL2,CL0,CL0,CL0,CL2,CL0,CL0,CL0,CL0,CL2,CL0,CL0
4,CL4,CL1,CL1,CL0,CL6,CL3,CL6,CL0,CL0,CL1,CL0,CL0,CL1,CL0,CL0,CL2,CL2,CL0,CL0


##### Trabajo solamente con el cannabis como droga target. 

##### Diccionario de la documentación 

| Código  | Categoría original     | Significado                  |
| ------- | ---------------------- | ---------------------------- |
| **CL0** | Never Used             | Nunca ha consumido           |
| **CL1** | Used over a Decade Ago | Consumió hace más de 10 años |
| **CL2** | Used in Last Decade    | Consumió en la última década |
| **CL3** | Used in Last Year      | Consumió en el último año    |
| **CL4** | Used in Last Month     | Consumió en el último mes    |
| **CL5** | Used in Last Week      | Consumió en la última semana |
| **CL6** | Used in Last Day       | Consumió en el último día    |


In [7]:
# Creo un mapa para el cannabis y obteno la varibale target binaria

map_cannabis_binary = {
    "CL0": 0,
    "CL1": 0,
    "CL2": 0,
    "CL3": 1,
    "CL4": 1,
    "CL5": 1,
    "CL6": 1
}

df["cannabis_binary"] = df["cannabis"].map(map_cannabis_binary)

In [8]:
# Comprobacion de balanceo de la variable target cannabis

df["cannabis_binary"].value_counts(normalize=True).round(2) * 100

cannabis_binary
1    53.0
0    47.0
Name: proportion, dtype: float64

In [9]:
# Compruebo si hay duplicados

df.duplicated().sum()

np.int64(0)

In [10]:
# Compruebo si hay nulos

df.isna().sum()


id                 0
age                0
gender             0
education          0
country            0
ethnicity          0
nscore             0
escore             0
oscore             0
ascore             0
cscore             0
impuslive          0
ss                 0
alcohol            0
amphet             0
amyl               0
benzos             0
caff               0
cannabis           0
choc               0
coke               0
crack              0
ecstasy            0
heroin             0
ketamine           0
legalh             0
lsd                0
meth               0
mushrooms          0
nicotine           0
semer              0
vsa                0
cannabis_binary    0
dtype: int64

In [11]:
# Me quedo solo con columnas numéricas

numeric_cols = df.select_dtypes(include="number").columns


# Identifico las variables features (X) y target (y)

X = df[numeric_cols].drop(columns=["cannabis_binary"])
y = df["cannabis_binary"]

In [12]:
# Train-Test Split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [13]:
# Aplico Estandarización para poner aplicar después KNN

scaler = StandardScaler()
X_train_norm = scaler.fit_transform(X_train)
X_test_norm = scaler.transform(X_test)

In [14]:
# Aplico KNN

knn = KNeighborsClassifier()
knn.fit(X_train_norm, y_train)

y_pred_knn = knn.predict(X_test_norm)

acc = accuracy_score(y_test, y_pred_knn)
print("Accuracy KNN:", acc)

print("\nMatriz de confusión:")
print(confusion_matrix(y_test, y_pred_knn))

print("\nInforme clasificación:")
print(classification_report(y_test, y_pred_knn))

Accuracy KNN: 0.8010610079575596

Matriz de confusión:
[[137  40]
 [ 35 165]]

Informe clasificación:
              precision    recall  f1-score   support

           0       0.80      0.77      0.79       177
           1       0.80      0.82      0.81       200

    accuracy                           0.80       377
   macro avg       0.80      0.80      0.80       377
weighted avg       0.80      0.80      0.80       377



In [15]:
# Aplico Regresion Logística

log_reg = LogisticRegression(max_iter=2000)  
log_reg.fit(X_train_norm, y_train)

y_pred_log = log_reg.predict(X_test_norm)

acc = accuracy_score(y_test, y_pred_log)
print("Accuracy Regresión Logística:", acc)

print("\nMatriz de confusión:")
print(confusion_matrix(y_test, y_pred_log))

print("\nInforme clasificación:")
print(classification_report(y_test, y_pred_log))

Accuracy Regresión Logística: 0.830238726790451

Matriz de confusión:
[[144  33]
 [ 31 169]]

Informe clasificación:
              precision    recall  f1-score   support

           0       0.82      0.81      0.82       177
           1       0.84      0.84      0.84       200

    accuracy                           0.83       377
   macro avg       0.83      0.83      0.83       377
weighted avg       0.83      0.83      0.83       377



In [16]:
# Aplico Arbol de decision

tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train_norm, y_train)

y_pred_tree = tree.predict(X_test_norm)

acc = accuracy_score(y_test, y_pred_tree)
print("Accuracy KNN:", acc)

print("\nMatriz de confusión:")
print(confusion_matrix(y_test, y_pred_tree))

print("\nInforme clasificación:")
print(classification_report(y_test, y_pred_tree))

Accuracy KNN: 0.7559681697612732

Matriz de confusión:
[[135  42]
 [ 50 150]]

Informe clasificación:
              precision    recall  f1-score   support

           0       0.73      0.76      0.75       177
           1       0.78      0.75      0.77       200

    accuracy                           0.76       377
   macro avg       0.76      0.76      0.76       377
weighted avg       0.76      0.76      0.76       377



##### Aplico Esemble

In [17]:
# Bagging

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

bagging = BaggingClassifier(
    estimator=DecisionTreeClassifier(random_state=42),
    n_estimators=100,
    random_state=42
)

bagging.fit(X_train_norm, y_train)
y_pred_bag = bagging.predict(X_test_norm)

print("Accuracy Bagging:", accuracy_score(y_test, y_pred_bag))
print("\nMatriz de confusión (Bagging):")
print(confusion_matrix(y_test, y_pred_bag))
print("\nInforme clasificación (Bagging):")
print(classification_report(y_test, y_pred_bag))

Accuracy Bagging: 0.8090185676392573

Matriz de confusión (Bagging):
[[139  38]
 [ 34 166]]

Informe clasificación (Bagging):
              precision    recall  f1-score   support

           0       0.80      0.79      0.79       177
           1       0.81      0.83      0.82       200

    accuracy                           0.81       377
   macro avg       0.81      0.81      0.81       377
weighted avg       0.81      0.81      0.81       377



In [18]:
# Random Forest

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=200,
    random_state=42
)

rf.fit(X_train_norm, y_train)
y_pred_rf = rf.predict(X_test_norm)

print("Accuracy Random Forest:", accuracy_score(y_test, y_pred_rf))
print("\nMatriz de confusión (Random Forest):")
print(confusion_matrix(y_test, y_pred_rf))
print("\nInforme clasificación (Random Forest):")
print(classification_report(y_test, y_pred_rf))

Accuracy Random Forest: 0.8328912466843501

Matriz de confusión (Random Forest):
[[140  37]
 [ 26 174]]

Informe clasificación (Random Forest):
              precision    recall  f1-score   support

           0       0.84      0.79      0.82       177
           1       0.82      0.87      0.85       200

    accuracy                           0.83       377
   macro avg       0.83      0.83      0.83       377
weighted avg       0.83      0.83      0.83       377



In [19]:
# AdaDoost

from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier(
    n_estimators=200,
    learning_rate=0.5,
    random_state=42
)

ada.fit(X_train_norm, y_train)
y_pred_ada = ada.predict(X_test_norm)

print("Accuracy AdaBoost:", accuracy_score(y_test, y_pred_ada))
print("\nMatriz de confusión (AdaBoost):")
print(confusion_matrix(y_test, y_pred_ada))
print("\nInforme clasificación (AdaBoost):")
print(classification_report(y_test, y_pred_ada))

Accuracy AdaBoost: 0.8169761273209549

Matriz de confusión (AdaBoost):
[[140  37]
 [ 32 168]]

Informe clasificación (AdaBoost):
              precision    recall  f1-score   support

           0       0.81      0.79      0.80       177
           1       0.82      0.84      0.83       200

    accuracy                           0.82       377
   macro avg       0.82      0.82      0.82       377
weighted avg       0.82      0.82      0.82       377



In [20]:
# Gradient Boosting

from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(
    random_state=42
)

gb.fit(X_train_norm, y_train)
y_pred_gb = gb.predict(X_test_norm)

print("Accuracy Gradient Boosting:", accuracy_score(y_test, y_pred_gb))
print("\nMatriz de confusión (Gradient Boosting):")
print(confusion_matrix(y_test, y_pred_gb))
print("\nInforme clasificación (Gradient Boosting):")
print(classification_report(y_test, y_pred_gb))

Accuracy Gradient Boosting: 0.8328912466843501

Matriz de confusión (Gradient Boosting):
[[142  35]
 [ 28 172]]

Informe clasificación (Gradient Boosting):
              precision    recall  f1-score   support

           0       0.84      0.80      0.82       177
           1       0.83      0.86      0.85       200

    accuracy                           0.83       377
   macro avg       0.83      0.83      0.83       377
weighted avg       0.83      0.83      0.83       377



##### Comparacion de todos los modelos

In [21]:
resultados = {
    "KNN": accuracy_score(y_test, y_pred_knn),
    "LogReg": accuracy_score(y_test, y_pred_log),
    "Tree": accuracy_score(y_test, y_pred_tree),
    "Bagging": accuracy_score(y_test, y_pred_bag),
    "RandomForest": accuracy_score(y_test, y_pred_rf),
    "AdaBoost": accuracy_score(y_test, y_pred_ada),
    "GradientBoost": accuracy_score(y_test, y_pred_gb),
}

for modelo, acc in resultados.items():
    print(f"{modelo}: {acc:.4f}")

KNN: 0.8011
LogReg: 0.8302
Tree: 0.7560
Bagging: 0.8090
RandomForest: 0.8329
AdaBoost: 0.8170
GradientBoost: 0.8329


##### Aplico GridSearch sobre los dos mejores modelos: Regresión logística y Gradient Boost

In [22]:
# Grid Search en el modelo Random Forest

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=42)

param_grid_rf = {
    "n_estimators": [100, 200],
    "max_depth": [None, 5, 10],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2]
}

grid_rf = GridSearchCV(
    estimator=rf,
    param_grid=param_grid_rf,
    cv=5,
    scoring="accuracy",
    n_jobs=-1
)

grid_rf.fit(X_train_norm, y_train)

print("Mejores hiperparámetros RF:", grid_rf.best_params_)
print("Mejor accuracy CV RF:", grid_rf.best_score_)

Mejores hiperparámetros RF: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
Mejor accuracy CV RF: 0.8434731909088911


In [23]:
# Evaluo el test

best_rf = grid_rf.best_estimator_

y_pred_rf_best = best_rf.predict(X_test_norm)

print("\nAccuracy RF (test):", accuracy_score(y_test, y_pred_rf_best))
print("\nMatriz de confusión RF (test):")
print(confusion_matrix(y_test, y_pred_rf_best))
print("\nInforme clasificación RF (test):")
print(classification_report(y_test, y_pred_rf_best))


Accuracy RF (test): 0.8328912466843501

Matriz de confusión RF (test):
[[140  37]
 [ 26 174]]

Informe clasificación RF (test):
              precision    recall  f1-score   support

           0       0.84      0.79      0.82       177
           1       0.82      0.87      0.85       200

    accuracy                           0.83       377
   macro avg       0.83      0.83      0.83       377
weighted avg       0.83      0.83      0.83       377



In [24]:
# Grid Search en el modelo Gradient Boosting

from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(random_state=42)

param_grid_gb = {
    "n_estimators": [100, 200],
    "learning_rate": [0.05, 0.1, 0.2],
    "max_depth": [2, 3],
    "subsample": [0.8, 1.0]
}

grid_gb = GridSearchCV(
    estimator=gb,
    param_grid=param_grid_gb,
    cv=5,
    scoring="accuracy",
    n_jobs=-1
)

grid_gb.fit(X_train_norm, y_train)

print("Mejores hiperparámetros GB:", grid_gb.best_params_)
print("Mejor accuracy CV GB:", grid_gb.best_score_)

Mejores hiperparámetros GB: {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 200, 'subsample': 0.8}
Mejor accuracy CV GB: 0.8401729334888121


In [25]:
# Evaluo el test

best_gb = grid_gb.best_estimator_

y_pred_gb_best = best_gb.predict(X_test_norm)

print("\nAccuracy GB (test):", accuracy_score(y_test, y_pred_gb_best))
print("\nMatriz de confusión GB (test):")
print(confusion_matrix(y_test, y_pred_gb_best))
print("\nInforme clasificación GB (test):")
print(classification_report(y_test, y_pred_gb_best))



Accuracy GB (test): 0.8355437665782494

Matriz de confusión GB (test):
[[142  35]
 [ 27 173]]

Informe clasificación GB (test):
              precision    recall  f1-score   support

           0       0.84      0.80      0.82       177
           1       0.83      0.86      0.85       200

    accuracy                           0.84       377
   macro avg       0.84      0.83      0.83       377
weighted avg       0.84      0.84      0.84       377



##### Comparación final de todos los modelos

In [26]:
resultados = {
    "KNN": accuracy_score(y_test, y_pred_knn),
    "LogReg": accuracy_score(y_test, y_pred_log),
    "Tree": accuracy_score(y_test, y_pred_tree),
    "Bagging": accuracy_score(y_test, y_pred_bag),
    "RandomForest_best": accuracy_score(y_test, y_pred_rf_best),
    "AdaBoost": accuracy_score(y_test, y_pred_ada),
    "GradientBoost_best": accuracy_score(y_test, y_pred_gb_best),
}

for modelo, acc in resultados.items():
    print(f"{modelo}: {acc:.4f}")

KNN: 0.8011
LogReg: 0.8302
Tree: 0.7560
Bagging: 0.8090
RandomForest_best: 0.8329
AdaBoost: 0.8170
GradientBoost_best: 0.8355


In [27]:
# Guardo el modelo ya entrenado en un archivo .pkl

from joblib import dump

best_model = best_gb  # cambia el nombre por el tuyo

feature_names = X.columns.tolist()
feature_means = X.mean().to_dict() 

artefacto = {
    "model": best_model,
    "scaler": scaler,           
    "features": feature_names,
    "feature_means": feature_means
}

dump(artefacto, "modelo_cannabis.pkl")
print("Guardado modelo_cannabis.pkl")


Guardado modelo_cannabis.pkl


In [28]:
X.columns

Index(['id', 'age', 'gender', 'education', 'country', 'ethnicity', 'nscore',
       'escore', 'oscore', 'ascore', 'cscore', 'impuslive', 'ss'],
      dtype='object')

In [29]:
df["age"].unique()

array([ 0.49788, -0.07854, -0.95197,  2.59171,  1.09449,  1.82213])

In [30]:
df["gender"].unique()

array([ 0.48246, -0.48246])

In [31]:
df["education"].unique()

array([-0.05921,  1.98437,  1.16365, -1.22751, -1.7379 ,  0.45468,
       -0.61113, -2.43591, -1.43719])

In [32]:
df["country"].unique()

array([ 0.96082,  0.24923, -0.57009, -0.28519, -0.09765,  0.21128,
       -0.46841])

In [33]:
df["ethnicity"].unique()

array([ 0.126  , -0.31685,  0.1144 , -0.22166, -0.50212, -1.10702,
        1.90725])