In [1]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
adult = fetch_ucirepo(id=2) 
  
# data (as pandas dataframes) 
X = adult.data.features 
y = adult.data.targets 

ConnectionError: Error connecting to server

# Analyzing the structure of the dataset

In [2]:
import numpy as np
import pandas as pd

In [3]:
X.shape

(48842, 14)

In [4]:
X.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'],
      dtype='object')

In [5]:
X.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
dtype: object

##### Determining the one hot encoding the categorical variables

In [6]:
columns_to_one_hot = []
columns_to_scale = []
for tup in zip(X.dtypes.index, X.dtypes):
    if tup[1] == np.dtype('O'):
        columns_to_one_hot.append(tup[0])
    else:
        columns_to_scale.append(tup[0])

# Preprocessing

In [7]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [8]:
# one hot encoding
df1 = OneHotEncoder(sparse_output=False).fit_transform(X[columns_to_one_hot])

# standard scaling
df2 = StandardScaler().fit_transform(X[columns_to_scale])

In [9]:
# getting the binary labels for y
binary_y = []
for inc in y['income']:
    if inc == '<=50K':
        binary_y.append(-1)
    else:
        binary_y.append(1)
binary_y = np.array(binary_y).reshape(-1, 1)

In [10]:
num_labels = {}
for val in binary_y:
    num_labels[val[0]] = num_labels.get(val[0], 0) + 1

In [11]:
# fairly even positive/negative split
num_labels

{-1: 24720, 1: 24122}

In [12]:
df2.shape

(48842, 6)

In [13]:
final_dataset = np.hstack((df1, df2, binary_y))

In [14]:
final_dataset.shape

(48842, 112)

In [16]:
# saving the dataset
# np.save("adult_dataset.npy", final_dataset)

#### Sanity Check; quick logistic regression model

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
np.random.seed(1) # Set the random seed.
np.random.shuffle(final_dataset)      # Shuffle the data points in X_and_Y array

In [None]:
X_train, X_test, y_train, y_test = train_test_split(final_dataset[:, 0:-1], final_dataset[:, -1], test_size=0.2, random_state=1)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred) # better than random guessing!

#### Curious about base model performance without one hot encoding

In [None]:
curiosity = np.hstack((df2, binary_y))
np.random.shuffle(curiosity)
X_train, X_test, y_train, y_test = train_test_split(curiosity[:, 0:-1], curiosity[:, -1], test_size=0.2, random_state=1)
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)


In [None]:
accuracy_score(y_test, y_pred) # it's worse.. which is expected

#### More sanity checking

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# Perform 5-fold cross-validation
scores = cross_val_score(model, final_dataset[:, 0:-1], final_dataset[:, -1], cv=5)

# Print the accuracy scores for each fold
print(f'Cross-validation scores: {scores}')

# Print the average accuracy
print(f'Average cross-validation score: {scores.mean()}')