In [1]:
# Code copied from main.py

In [1]:
import torch
import sklearn
import numpy as np
import pandas as pd
from robust_losses import RobustLoss
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from torch.utils.data import Dataset
from glob import glob
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [3]:
class Model(nn.Module):
    """Tabular classifier mixing embedded categorical and continuous features.

    Every categorical column is looked up in its own ``nn.Embedding``; the
    embedding vectors are concatenated and passed through dropout. The
    continuous features go through ``BatchNorm1d``. Both parts are joined and
    fed to a 64 -> 128 -> 32 -> 2 fully connected stack that returns two
    raw (un-normalised) scores per sample.
    """

    def __init__(self, embedding_sizes, n_continuous, p_dropout = 0.4):
        """Build the embedding tables, the input normaliser and the MLP.

        Args:
            embedding_sizes: sequence of ``(num_embeddings, embedding_dim)``
                pairs, one per categorical column (it is iterated twice).
            n_continuous: number of continuous input features.
            p_dropout: dropout probability used after the embeddings and
                after every hidden activation.
        """
        super().__init__()

        # one lookup table per categorical column
        self.embeddings = nn.ModuleList(
            [nn.Embedding(n_levels, dim) for n_levels, dim in embedding_sizes]
        )
        self.embeddings_dropout = nn.Dropout(p_dropout)

        # batch-normalise the continuous inputs
        self.normalize_continuous = nn.BatchNorm1d(n_continuous)

        # hidden stack: Linear -> ReLU -> Dropout -> BatchNorm for each width
        total_embedding_dim = sum(dim for _, dim in embedding_sizes)
        width_in = total_embedding_dim + n_continuous
        stack = []
        for width_out in (64, 128, 32):
            stack.append(nn.Linear(in_features=width_in, out_features=width_out))
            stack.append(nn.ReLU(inplace=True))
            stack.append(nn.Dropout(p=p_dropout))
            stack.append(nn.BatchNorm1d(width_out))
            width_in = width_out

        # output layer: two scores per sample
        stack.append(nn.Linear(in_features=width_in, out_features=2))
        self.layers = nn.Sequential(*stack)

    def forward(self, x_categorical, x_continuous):
        """Compute the two output scores for one batch.

        Args:
            x_categorical: sequence whose first element is a 2-D tensor of
                category codes (one column per embedding table).
            x_continuous: sequence whose first element is a 2-D tensor of
                continuous features.

        Returns:
            Tensor with one row per sample and two columns.
        """
        # both inputs arrive wrapped in a sequence; unwrap the tensors
        codes = x_categorical[0].int()
        continuous = x_continuous[0]

        # look up every categorical column in its own table and join them
        looked_up = [table(codes[:, col]) for col, table in enumerate(self.embeddings)]
        embedded = self.embeddings_dropout(torch.cat(looked_up, dim=1))

        # normalise the continuous part
        normalised = self.normalize_continuous(continuous)

        # joint feature vector through the fully connected stack
        return self.layers(torch.cat([embedded, normalised], dim=1))

In [None]:
def testaccuracy(model, test_categorical_dataloader, test_continuous_dataloader, test_target_dataloader):
    with torch.no_grad():
        y_pred = None
        y_test = None
        for i, (x_cat, x_cont, y) in enumerate(zip(test_categorical_dataloader, test_continuous_dataloader, test_target_dataloader)):
            y_pred = F.softmax(model(x_cat, x_cont))
            y_test = y
        print(y)
        y_pred_class=y_pred.round()
        accuracy=(y_pred_class.eq(y_test).sum())/float(y_test.shape[0])
        return (accuracy.item())

In [25]:
def saveModel(model, fname="DRO_model.pth"):
    """Write ``model``'s ``state_dict`` to ``fname`` with ``torch.save``.

    Args:
        model: module whose parameters and buffers are saved.
        fname: destination path; defaults to ``"DRO_model.pth"`` in the
            current working directory, so existing one-argument calls keep
            writing to the same file.
    """
    torch.save(model.state_dict(), fname)

In [26]:
# Directory prefix of the dataset. File names such as 'train.csv' and
# 'test.csv' are appended with `+` in the training cell, so the trailing
# slash is required.
path = '../data/datasets/uci_adult/'

In [27]:
# Sub-directories of the synthetic/ folder (the trailing "/" in the pattern
# makes glob return directories only). The only reference to this list in the
# cells shown here is a commented-out `paths.extend(synthfols)`.
synthfols = glob("../data/datasets/uci_adult/synthetic/*/")

In [28]:
# Directories the training cell loops over; currently just the base dataset.
paths = [path]
# Disabled: would append the synthetic sub-directories to the list.
# paths.extend(synthfols)

In [52]:
# Column layout of the header-less CSV files.
_ADULT_COLUMNS = ['age', 'workclass', 'fnlwgt','education', 'education.num', 'marital.status',
           'occupation', 'relationship', 'race', 'sex', 'capital.gain',
           'capital.loss', 'hours.per.week', 'native.country','income']

cats = ['workclass', 'education.num', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']
conts = ['age', 'fnlwgt', 'capital.gain', 'capital.loss', 'hours.per.week']
target = 'income'


def _collapse_levels(series, levels, replacement):
    """Return ``series`` with every value contained in ``levels`` replaced by ``replacement``."""
    return series.where(~series.isin(levels), replacement)


def _load_adult_frame(csv_path, continuous_columns, target_column):
    """Read one header-less CSV and apply the cleaning shared by train and test.

    The same steps run for both splits so that their raw values are
    comparable before they are integer-encoded.
    """
    frame = pd.read_csv(csv_path, header=None)
    frame.columns = _ADULT_COLUMNS
    # keep the (raw) 'sex' value as row label
    frame.index = frame['sex']

    # strip surrounding whitespace from every string column
    frame = frame.apply(lambda col: col.str.strip() if col.dtype == "object" else col)

    # NOTE(review): the original UCI test file suffixes its labels with '.';
    # dropping it is a no-op when the suffix is absent — confirm for these CSVs.
    if frame[target_column].dtype == "object":
        frame[target_column] = frame[target_column].str.rstrip('.')

    for column in continuous_columns:
        frame[column] = frame[column].astype('float64')

    # merge rare / unknown levels
    frame['workclass'] = _collapse_levels(frame['workclass'], ['?', 'Without-pay', 'Never-worked'], 'Without-pay')
    frame['marital.status'] = _collapse_levels(
        frame['marital.status'], ['Married-AF-spouse', 'Married-spouse-absent'], 'Married-spouse-absent'
    )
    # binary native country: 'United-States' vs. everything else
    frame['native.country'] = frame['native.country'].where(frame['native.country'] == 'United-States', 'other')

    # 'education' is not used as a feature ('education.num' is in `cats`)
    return frame.drop('education', axis=1)


def _encode_with_shared_vocabulary(train_frame, test_frame, columns):
    """Integer-encode ``columns`` in place using one vocabulary for both frames.

    Encoding each split on its own can assign different codes to the same
    level. Building the vocabulary from the union of both splits keeps the
    codes consistent and guarantees every test code has an embedding row.

    Returns:
        dict mapping column name to the number of levels in its vocabulary.
    """
    vocabulary_sizes = {}
    for column in columns:
        categories = pd.concat([train_frame[column], test_frame[column]]).astype('category').cat.categories
        for frame in (train_frame, test_frame):
            frame[column] = pd.Categorical(frame[column], categories=categories).codes.astype('int32')
        vocabulary_sizes[column] = len(categories)
    return vocabulary_sizes


def _make_loader(values, batch_size):
    """Wrap a numpy array in an unshuffled single-tensor ``DataLoader``.

    The loaders are zipped together later, so they must stay unshuffled and
    share the batch size to remain row-aligned.
    """
    return DataLoader(TensorDataset(torch.Tensor(values)), batch_size=batch_size)


for p in paths:

#     if os.path.exists(p+'preds/DRO_pred.pt') == False:

    # Read from the directory being iterated (the loop variable), not the
    # global `path`.
    # NOTE(review): this expects a test.csv inside every entry of `paths` —
    # confirm before adding the synthetic folders.
    train_df = _load_adult_frame(p + 'train.csv', conts, target)
    test_df = _load_adult_frame(p + 'test.csv', conts, target)

    vocabulary_sizes = _encode_with_shared_vocabulary(train_df, test_df, cats + [target])

    X_train = train_df.drop([target], axis=1)
    X_test = test_df.drop([target], axis=1)

    y_train = train_df[target]
    y_test = test_df[target]

    # min-max scale the continuous columns with statistics of the training split only
    cont_min = X_train[conts].min()
    cont_range = X_train[conts].max() - cont_min
    X_train[conts] = ((X_train[conts] - cont_min) / cont_range).to_numpy()
    X_test[conts] = ((X_test[conts] - cont_min) / cont_range).to_numpy()

    batch_size = 256

    train_categorical_dataloader = _make_loader(X_train[cats].to_numpy(), batch_size)
    train_continuous_dataloader = _make_loader(X_train[conts].to_numpy(), batch_size)
    train_target_dataloader = _make_loader(y_train.to_numpy(), batch_size)

    test_categorical_dataloader = _make_loader(X_test[cats].to_numpy(), batch_size)
    test_continuous_dataloader = _make_loader(X_test[conts].to_numpy(), batch_size)
    test_target_dataloader = _make_loader(y_test.to_numpy(), batch_size)

    # one (num_embeddings, embedding_dim) pair per categorical column, sized
    # from the shared vocabulary rather than from the test frame alone
    embeddings_sizes = []
    for cat in cats:
        categories = vocabulary_sizes[cat]
        embedding_size = (categories + 1) // 2
        embeddings_sizes.append((categories, embedding_size))

    model = Model(embedding_sizes=embeddings_sizes, n_continuous=len(conts))

    optimizer = torch.optim.Adam(params=model.parameters(), lr=0.0006)
    # per-sample losses; RobustLoss reduces them to a scalar
    criterion = nn.CrossEntropyLoss(reduction='none')
    robust_loss = RobustLoss(geometry='chi-square', size=1.0, reg=0.5)

    epochs = 100
    losses = []
    best_accuracy = 0.0

    for epoch in range(epochs):

        # make sure dropout / BatchNorm are in training mode, whatever the
        # evaluation helper left behind
        model.train()

        for x_cat, x_cont, y in zip(train_categorical_dataloader, train_continuous_dataloader, train_target_dataloader):

            # the loader yields a one-element list; CrossEntropyLoss wants integer class indices
            y_true = y[0].flatten().long()

            y_pred = model(x_cat, x_cont)
            loss = robust_loss(criterion(y_pred, y_true))

            # backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # NOTE(review): the checkpoint is selected on the test split, so the
        # reported best accuracy is optimistic; a validation split would be cleaner.
        accuracy = testaccuracy(model,test_categorical_dataloader,test_continuous_dataloader,test_target_dataloader)
        print('accuracy:', accuracy)
        print('best:', best_accuracy)
        if accuracy > best_accuracy:
            saveModel(model)
            best_accuracy = accuracy

        # loss of the last batch of the epoch
        losses.append(loss.item())
        print(f"epoch: {epoch+1}      loss: {loss}")

    # reload the best checkpoint and predict on the full test set
    finalmodel = Model(embedding_sizes=embeddings_sizes, n_continuous=len(conts))
    finalmodel.load_state_dict(torch.load('DRO_model.pth'))
    finalmodel.eval()  # inference: dropout off, BatchNorm running statistics

    with torch.no_grad():
        # concatenate the scores of every batch instead of keeping only the last one
        y_pred = torch.cat([
            finalmodel(x_cat, x_cont)
            for x_cat, x_cont in zip(test_categorical_dataloader, test_continuous_dataloader)
        ])
        print(y_pred)
# TODO(review): disabled export of the predictions, kept for reference.
# y_pred holds raw scores, so the class is y_pred.argmax(dim=1), not y_pred.round().
#         y_pred_class=y_pred.argmax(dim=1)
#         os.makedirs(p+'preds/', exist_ok=True)
#         torch.save(y_pred_class,p+'preds/DRO_pred.pt')



[tensor([0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 1., 1., 1., 0., 0., 1., 1., 0.,
        0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 1.,
        0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0.,
        1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0.,
        0., 1., 0., 0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0., 1., 0.,
        1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 1.])]


AttributeError: 'list' object has no attribute 'round'