In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [2]:
# Load the survey dataset from the processed parquet file into `df`.
df = pd.read_parquet("data/processed/survey_ai_usage_clean_filtered.parquet")

In [3]:
# Print the columns, dtypes, non-null counts and memory usage of `df`.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33065 entries, 0 to 33064
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   DevType                 33065 non-null  category
 1   WorkExp                 33065 non-null  float64 
 2   LanguageHaveWorkedWith  33065 non-null  object  
 3   Country                 33065 non-null  category
 4   RemoteWork              33065 non-null  category
 5   Industry                33065 non-null  category
 6   OrgSize                 33065 non-null  category
 7   EdLevel                 33065 non-null  category
 8   AI_Usage                33065 non-null  int64   
 9   NumLanguages            33065 non-null  int64   
dtypes: category(6), float64(1), int64(2), object(1)
memory usage: 1.2+ MB


In [None]:
# Categorical columns to store with pandas' `category` dtype.
cat_cols = [
    "DevType", "Country", "RemoteWork", "Industry", "OrgSize", "EdLevel"
]

# Convert every listed column in a single `astype` call (one dtype per
# column name) instead of looping column by column.
df = df.astype({col: "category" for col in cat_cols})

# Full overview of the frame after the conversion (printed to stdout).
df.info()

# Check the result. This has to be the cell's last expression: a bare
# expression anywhere else in a cell is evaluated and then discarded, so the
# dtypes would never be displayed.
df.dtypes[cat_cols]

In [10]:
# === Model comparison (simple and robust version) ===
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# 1) Data: every column except the target `AI_Usage` is a feature.
X = df.drop(columns=['AI_Usage'])
y = df['AI_Usage']

# Split the features by dtype. Columns stored as pandas `category` have to be
# selected as categorical as well: matching only `object` leaves them out of
# the one-hot encoder, and `exclude='object'` would hand them to
# StandardScaler as if they were numeric.
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
num_cols = X.select_dtypes(include='number').columns.tolist()

# ColumnTransformer drops columns that are not assigned to a transformer
# (remainder='drop' is its default), so fail loudly instead of silently
# losing a feature whose dtype is neither categorical nor numeric.
unassigned_cols = X.columns.difference(cat_cols + num_cols).tolist()
if unassigned_cols:
    raise ValueError(
        f"Columns with a dtype that is neither categorical nor numeric: {unassigned_cols}"
    )

# NOTE(review): `LanguageHaveWorkedWith` is an object column; if it holds
# delimited multi-language strings, one-hot encoding treats every distinct
# combination as its own category -- confirm this is intended.
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols),
    ('num', StandardScaler(), num_cols)
])

# Candidate models: each algorithm with and without class re-weighting.
models = {
    'Logistic Regression (sin balance)': LogisticRegression(max_iter=1000),
    'Logistic Regression (balanceado)': LogisticRegression(max_iter=1000, class_weight='balanced'),
    'Random Forest (sin balance)': RandomForestClassifier(n_estimators=200, random_state=42),
    'Random Forest (balanceado)': RandomForestClassifier(n_estimators=200, random_state=42, class_weight='balanced')
}

# Stratified 80/20 split so both partitions keep the target's class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 2) Train and evaluate every model on the same split.
rows = []
for name, model in models.items():
    pipe = Pipeline(steps=[('preprocess', preprocessor), ('model', model)])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    # With output_dict=True the per-class metrics are keyed by the label
    # converted to a string, hence '1' for the positive class.
    rep = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
    rows.append({
        'Modelo': name,
        'Accuracy': rep['accuracy'],
        'Precision (1)': rep['1']['precision'],
        'Recall (1)': rep['1']['recall'],
        'F1-score (1)': rep['1']['f1-score']
    })

# Rank the models by F1 of the positive class, best first.
df_results = pd.DataFrame(rows).sort_values(by='F1-score (1)', ascending=False).reset_index(drop=True)

# 3) Plain text view in the console/notebook (rounded).
print(df_results.round(3))

# 4) Save without styling. Create the target folder first so the export does
# not fail when the notebook runs on a fresh checkout.
output_dir = Path("docs/assets/charts")
output_dir.mkdir(parents=True, exist_ok=True)
df_results.round(3).to_csv(output_dir / "model_comparison_table.csv", index=False)
df_results.to_html(output_dir / "model_comparison_table.html",
                   index=False, float_format=lambda x: f"{x:.3f}")


                              Modelo  Accuracy  Precision (1)  Recall (1)  \
0        Random Forest (sin balance)     0.790          0.794       0.992   
1  Logistic Regression (sin balance)     0.791          0.800       0.980   
2         Random Forest (balanceado)     0.788          0.795       0.987   
3   Logistic Regression (balanceado)     0.674          0.845       0.719   

   F1-score (1)  
0         0.882  
1         0.881  
2         0.880  
3         0.777  


In [5]:
# Display the model comparison table.
df_results

Unnamed: 0,Modelo,Accuracy,Precision (1),Recall (1),F1-score (1)
2,Random Forest (sin balance),0.790262,0.794018,0.991572,0.881867
0,Logistic Regression (sin balance),0.790866,0.800125,0.979889,0.88093
3,Random Forest (balanceado),0.788749,0.794516,0.987933,0.880731
1,Logistic Regression (balanceado),0.674883,0.8456,0.719594,0.777525


In [6]:
# Print the structure (columns, dtypes, non-null counts) of the results table.
df_results.info()
# Summary statistics of the metric columns across the compared models; as the
# cell's last expression, this is the value rendered as the cell output.
df_results.describe()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, 2 to 1
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Modelo         4 non-null      object 
 1   Accuracy       4 non-null      float64
 2   Precision (1)  4 non-null      float64
 3   Recall (1)     4 non-null      float64
 4   F1-score (1)   4 non-null      float64
dtypes: float64(4), object(1)
memory usage: 364.0+ bytes


Unnamed: 0,Accuracy,Precision (1),Recall (1),F1-score (1)
count,4.0,4.0,4.0,4.0
mean,0.76119,0.808565,0.919747,0.855263
std,0.057545,0.024845,0.133525,0.051828
min,0.674883,0.794018,0.719594,0.777525
25%,0.760283,0.794392,0.914815,0.854929
50%,0.789506,0.797321,0.983911,0.88083
75%,0.790413,0.811494,0.988843,0.881164
max,0.790866,0.8456,0.991572,0.881867
