We will train an XGBoost model on the Adult Income dataset and deploy it on Hugging Face Spaces.

In [None]:
!pip install xgboost
!pip install scikit-learn

In [None]:
!wget http://www.donlapark.cmustat.com/Income.csv

In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from xgboost import XGBClassifier


# Education labels listed from lowest to highest; each label is mapped to its
# 1-based position in this ordering (Preschool -> 1, ..., Doctorate -> 16).
EDU_DICT = {
    level: rank
    for rank, level in enumerate(
        [
            'Preschool',
            '1st-4th',
            '5th-6th',
            '7th-8th',
            '9th',
            '10th',
            '11th',
            '12th',
            'HS-grad',
            'Some-college',
            'Assoc-voc',
            'Assoc-acdm',
            'Bachelors',
            'Masters',
            'Prof-school',
            'Doctorate',
        ],
        start=1,
    )
}


# Read the downloaded CSV into a DataFrame; the target column is still part of
# the frame at this point.
X_train = pd.read_csv(filepath_or_buffer='Income.csv')

# Trailing expression so the notebook renders a preview of the frame.
X_train

Unnamed: 0,age,workclass,education,marital.status,occupation,relationship,race,sex,hours.per.week,native.country,income
0,90,?,HS-grad,Widowed,?,Not-in-family,White,Female,40,United-States,<=50K
1,82,Private,HS-grad,Widowed,Exec-managerial,Not-in-family,White,Female,18,United-States,<=50K
2,66,?,Some-college,Widowed,?,Unmarried,Black,Female,40,United-States,<=50K
3,54,Private,7th-8th,Divorced,Machine-op-inspct,Unmarried,White,Female,40,United-States,<=50K
4,41,Private,Some-college,Separated,Prof-specialty,Own-child,White,Female,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...
32556,22,Private,Some-college,Never-married,Protective-serv,Not-in-family,White,Male,40,United-States,<=50K
32557,27,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,38,United-States,<=50K
32558,40,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,40,United-States,>50K
32559,58,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,40,United-States,<=50K


In [None]:
# Detach the target column from the feature frame and binarise it:
# 1 when income is ">50K", 0 otherwise.
y_train = X_train.pop("income")
y_train = (y_train == ">50K").astype(int)

# Encode education as its ordinal rank. The result is assigned back to the
# column rather than calling `.replace(..., inplace=True)` on the selected
# Series: that chained in-place form is deprecated by pandas and leaves
# `X_train` untouched under copy-on-write. `.map` also yields a numeric column
# directly instead of relying on `replace` downcasting an object column.
# Labels missing from EDU_DICT become NaN.
X_train['education'] = X_train['education'].map(EDU_DICT)

# Names of numerical features
num_col = X_train.select_dtypes(include=['int64', 'float64']).columns
# Names of categorical features
cat_col = X_train.select_dtypes(include=['object', 'bool']).columns

print(num_col)
print(cat_col)

Index(['age', 'education', 'hours.per.week'], dtype='object')
Index(['workclass', 'marital.status', 'occupation', 'relationship', 'race',
       'sex', 'native.country'],
      dtype='object')


In [None]:
# Standardise the numeric columns and one-hot encode the categorical ones.
# `sparse_output=False` requests a dense array from the encoder; the older
# `sparse` keyword was deprecated in scikit-learn 1.2 and removed in 1.4, so it
# raises a TypeError on current releases.
preprocessor = ColumnTransformer([("scaler", StandardScaler(), num_col),
                                  ("onehot", OneHotEncoder(sparse_output=False), cat_col)])

# Chain preprocessing and the classifier so the fitted object accepts the raw
# feature columns directly.
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', XGBClassifier())])

model.fit(X_train, y_train)

### Saving the model

In [None]:
import joblib

# Serialise the fitted pipeline (preprocessing + classifier) to disk.
joblib.dump(value=model, filename='model.joblib')

In [None]:
# Distinct observed values of every categorical column, keyed by column name.
unique_values = dict((name, X_train[name].unique()) for name in cat_col)
# 'education' is stored as numeric ranks in X_train, so its label choices are
# taken from the keys of EDU_DICT instead.
unique_values['education'] = list(EDU_DICT)

joblib.dump(unique_values, 'unique_values.joblib')


In [None]:
# Display the dictionary that was written to 'unique_values.joblib'.
unique_values

In [None]:
# Print the scikit-learn version installed in this environment.
# NOTE(review): presumably recorded so the deployment environment can pin the
# same release when loading 'model.joblib' — confirm.
import sklearn

print(sklearn.__version__)