In [1]:
import pandas as pd
#Carga el dataset en un dataframe

df = pd.read_csv("data.csv")
df.head()


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [2]:
#Separa entre X e Y. El target es la columna "income"
X = df.drop("income", axis=1)
y = df["income"]


In [3]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

num_features = X.select_dtypes(include=['int64', 'float64']).columns
cat_features = X.select_dtypes(include=['object']).columns


In [4]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_features)
    ]
)


In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=42
)


In [6]:
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

#Max_depth en un for:
for depth in [3, 5, 7, 10]:
    clf_tree = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', DecisionTreeClassifier(max_depth=depth, random_state=42))
    ])
    
    clf_tree.fit(X_train, y_train)
    y_pred = clf_tree.predict(X_test)
    
    print(f"\nDecisionTree max_depth={depth}")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.3f}")
    print(classification_report(y_test, y_pred))



DecisionTree max_depth=3
Accuracy: 0.579
              precision    recall  f1-score   support

       <=50K       0.58      0.95      0.72      7416
      <=50K.       1.00      0.06      0.11      3731
        >50K       0.51      0.52      0.52      2352
       >50K.       0.00      0.00      0.00      1154

    accuracy                           0.58     14653
   macro avg       0.52      0.38      0.34     14653
weighted avg       0.63      0.58      0.48     14653



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



DecisionTree max_depth=5
Accuracy: 0.587
              precision    recall  f1-score   support

       <=50K       0.59      0.96      0.73      7416
      <=50K.       0.95      0.08      0.15      3731
        >50K       0.53      0.50      0.52      2352
       >50K.       0.46      0.01      0.02      1154

    accuracy                           0.59     14653
   macro avg       0.63      0.39      0.35     14653
weighted avg       0.66      0.59      0.49     14653


DecisionTree max_depth=7
Accuracy: 0.590
              precision    recall  f1-score   support

       <=50K       0.59      0.96      0.73      7416
      <=50K.       0.95      0.08      0.15      3731
        >50K       0.53      0.52      0.53      2352
       >50K.       0.31      0.02      0.03      1154

    accuracy                           0.59     14653
   macro avg       0.59      0.39      0.36     14653
weighted avg       0.65      0.59      0.50     14653


DecisionTree max_depth=10
Accuracy: 0.592
   

In [7]:
from sklearn.linear_model import LogisticRegression

clf_log = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(
        solver='lbfgs',
        max_iter=1000,
        random_state=42
    ))
])

clf_log.fit(X_train, y_train)
y_pred_log = clf_log.predict(X_test)

print("Logistic Regression")
print(f"Accuracy: {accuracy_score(y_test, y_pred_log):.3f}")
print(classification_report(y_test, y_pred_log))


Logistic Regression
Accuracy: 0.594
              precision    recall  f1-score   support

       <=50K       0.61      0.93      0.73      7416
      <=50K.       0.91      0.09      0.17      3731
        >50K       0.50      0.62      0.55      2352
       >50K.       0.59      0.02      0.04      1154

    accuracy                           0.59     14653
   macro avg       0.65      0.42      0.38     14653
weighted avg       0.67      0.59      0.51     14653



In [8]:
from sklearn.ensemble import RandomForestClassifier

clf_rf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        random_state=42
    ))
])

clf_rf.fit(X_train, y_train)
y_pred_rf = clf_rf.predict(X_test)

print("Random Forest")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.3f}")
print(classification_report(y_test, y_pred_rf))


Random Forest
Accuracy: 0.593
              precision    recall  f1-score   support

       <=50K       0.60      0.96      0.73      7416
      <=50K.       0.93      0.09      0.16      3731
        >50K       0.53      0.54      0.54      2352
       >50K.       0.83      0.01      0.02      1154

    accuracy                           0.59     14653
   macro avg       0.72      0.40      0.36     14653
weighted avg       0.69      0.59      0.50     14653

