In [1]:
import pandas as pd
import json
import sys
import os
import pandas as pd
import numpy as np

In [2]:
class Column(object):
    """
    Column module containing functionality to convert feature values into written text

    name: String corresponding to the name of the column in the data set.
    attribute: String corresponging to the text description of the column.
    col_type: The type of the column: binary, categorical or numerical.
    verb: The verb required to conjugate the attribute.
    encode_fn: The function used to encode categorical values of this column.

    """
    def __init__(self, name, attribute=None, col_type=None, verb=None, encode_fn=None):
        self.name = name
        self.attribute = attribute
        self.type = col_type
        self.verb = verb
        self.encode_fn = encode_fn

    def is_binary(self):
        return self.type == "binary"

    def is_categorical(self):
        return self.type == "categorical"

    def is_numerical(self):
        return self.type == "numerical"

    def create_sentence(self, value, imp_value, prefix, missing_word, replace_numbers, descriptive):
        """
        Parameters::
            value: The value of this column at a specific data point
            imp_value: The imputed value of this column at a specific data point.
            prefix: String containing the desired prefix to add at the beginning of the sentence ("", "the Patient", etc.)
            missing_word: String describing how to handle missing values (e.g. "", "is missing" "imp_replace")
            replace_numbers: Boolean indicating weather or not to replace numerical values with text (e.g. very low, high, normal)
            descriptive: Boolean indicating weather or not the sentence should be descriptive.

        Returns::
            String with a sentence describing the column and its value.
            In the case of missing values:
                1. If missing_word == "" the sentence is the empty string
                2. If missing_word == "imp_replace" the sentence is constructed using the imputed value
                3. For all other cases the sentence is constructed using the text in the string missing_word

        """
        if descriptive:
            return self.create_descriptive_sentence(value, imp_value, prefix, missing_word, replace_numbers)
        else:
            return self.create_basic_sentence(value, imp_value, prefix, missing_word, replace_numbers)

In [3]:
class Binary_Column(Column):
    """
    Binary Column submodule for columns with values in [1, 0, true, false, "1", "0", "true", "false"]

    verb: The positive from of the verb used to conjugate the attribute when value is 1, "1" or "True"
    neg_verb: Negative form of the verb used to conjugate the attribute when value is 0, "0" or "false"

    """
    def __init__(self, name, attribute, verb, neg_verb, encode_fn=None):
        self.neg_verb = neg_verb
        super().__init__(name, attribute, "binary", verb, encode_fn)


    def create_descriptive_sentence(self, value, imp_value, prefix, missing_word, replace_numbers):
        sentence = ""

        if str(value).lower()  in ["1", "0", "true", "false"]:

            if int(value) == 1:

                sentence = prefix + " " + self.verb + " " + self.attribute

            elif int(value) == 0:

                sentence = prefix + " " + self.neg_verb + " " + self.attribute

        return sentence


    def create_basic_sentence(self, value, imp_value, prefix, missing_word, replace_numbers):

        sentence = ""

        if str(value).lower()  in ["1", "0", "true", "false"]:

            if int(value) == 1:

                sentence = self.verb + " " + self.attribute + ": yes"

            elif int(value) == 0:

                sentence = self.neg_verb + " " + self.attribute +" : no"

        elif missing_word != "":
            sentence = self.verb + " " + self.attribute + ": " + missing_word
        return sentence

In [4]:
class Categorical_Column(Column):
    """
    Categorical Column submodule for columns with non-numerical values

    """
    def __init__(self, name, attribute, verb, encode_fn=None):
        super().__init__(name, attribute, "categorical", verb, encode_fn)

    def create_descriptive_sentence(self, value, imp_value, prefix, missing_word, replace_numbers):
        if len(prefix) != 0:
            prefix = prefix[:-1] + "'s "
        sentence = ""
        if str(value).lower() not in ["nan", "", "none", "missing"]:
            sentence = prefix + self.attribute + " " + self.verb + " " + str(value)
        elif missing_word not in ["", "imp_replace"]:
            sentence = prefix  + self.attribute + " " + self.verb + " " + missing_word
        elif missing_word == "imp_replace":
            sentence = prefix + self.attribute + " " + self.verb + " " + str(imp_value)
        return sentence


    def create_basic_sentence(self, value, imp_value, missing_word, replace_numbers):
        sentence = ""
        if  str(value).lower() not in ["nan", "", "none", "missing"]:
            sentence = self.attribute + ": " + str(value)
        elif missing_word not in ["", "imp_replace"]:
            sentence = self.attribute + ": " + missing_word
        elif missing_word == "imp_replace":
            sentence = self.attribute + ": " + str(imp_value)
        return sentence

In [5]:
class Numerical_Column(Column):
    """
    Numerical Column submodule for columns with numerical values

    avg: The average of the values observed for this column (to be computed usign Training set)
    sd: The standard deviation of the values observed for this column (to be computed usign Training set)

    """
    def __init__(self, name, attribute, verb, avg, sd, encode_fn = None):
        self.avg = avg
        self.sd = sd
        super().__init__(name, attribute, "numerical", verb, encode_fn)


    def create_descriptive_sentence(self, value, imp_value, prefix, missing_word, replace_numbers):
        if len(prefix) != 0:
            prefix = prefix[:-1] + "'s "
        sentence = ""
        if str(value).lower() not in ["nan", "", "none", "missing"]:
            value = float(value)
            col_value = self.encode_number(value, replace_numbers)
            sentence = prefix  + self.attribute + " " + self.verb + " " + str(col_value)
        elif  missing_word not in ["", "imp_replace"]:
            sentence = prefix  + self.attribute + " " + self.verb + " " + missing_word
        elif missing_word == "imp_replace":
            col_value = self.encode_number(imp_value, replace_numbers)
            sentence = prefix  + self.attribute + " " + self.verb + " " + str(col_value)
        return sentence


    def create_basic_sentence(self, value, imp_value, prefix, missing_word, replace_numbers):
        sentence = ""
        if  str(value).lower() not in ["nan", "", "none", "missing"]:
            value = float(value)
            col_value = self.encode_number(value, replace_numbers)
            sentence = self.attribute + ": " + str(col_value)
        elif missing_word not in ["", "imp_replace"]:
            sentence = self.attribute + ": " + missing_word
        elif missing_word == "imp_replace":
            col_value = self.encode_number(imp_value, replace_numbers)
            sentence = self.attribute + ": " + str(col_value)
        return sentence

    def encode_number(self, value, replace_numbers):
        new_value = value
        if replace_numbers:
            if self.avg - 2*self.sd > value:
                new_value = "very low"
            elif self.avg - 2*self.sd <= value < self.avg - self.sd:
                new_value = "low"
            elif self.avg + 2*self.sd >= value > self.avg + self.sd:
                new_value = "high"
            elif self.avg + 2*self.sd < value:
                new_value = "very high"
            else:
                new_value = "normal"
        return new_value

In [6]:
class Table(object):
    """
    Table module containing tabular information for a specific id.

    name: String corresponding to the name of the table.
    df: Dataframe containing the tabular data for a specific id.
    columns: List of Column objects corresponing to the columns in df.
    metadata: String containing metadata information about this table structure.
    time_col: Name of the column in df containing the timestamp for each observation.
    imputer: Function used to impute the missing values in df.

    """

    def __init__(self, name="", df=pd.DataFrame(), columns=None, metadata=None, imputer=None):

        self.name = name
        self.columns = columns
        self.metadata = metadata
        self.df = df
        self.is_empty = pd.isna(df).all().all()

    def create_encoded_imputed_vectors(self):
        """
        Creates encoded and imputed versions of the table contents.
        """
        encoded_df =  pd.DataFrame()

        for column in self.columns:
            col_values = self.df[column.name]
            col_encoder = column.encode_fn
            labels = col_encoder(col_values[col_values.notnull()])
            encoded_df[column.name] = col_values
            encoded_df[column.name] = pd.Series(labels, index=col_values[col_values.notnull()].index)

        self.encodings = encoded_df


    def create_text(self, prefix, missing_word, descriptive, meta, sep = "</s>"):
        """
        Creates a timestamped dataframe; each row contains a String (paragraph) with all the tabular information for the
        corresponding timestamp.

        Paramteres::
            prefix: String containing the desired prefix to add at the beginning of each sentence ("", "the Patient", etc.)
            missing_word: String describing how to handle missing values (e.g. "", "is missing" "imp_replace")
            replace_numbers: Boolean indicating weather or not to replace numerical values with text (e.g. very low, high, normal)
            descriptive: Boolean indicating weather or not each sentence should be descriptive.
            meta: Boolean indicating weather or not to include meta information in the paragraphs.
            sep: String indicating what symbol to use at the end of the paragraph as a separator between tables.
        """
        self.text = pd.DataFrame()
        text = []

        for t_i in range(self.df.shape[0]):

            text_i = ""

            if meta & (len(str(self.metadata)) >1):
                text_i = self.metadata

            for column in self.columns:

                value = self.df.iloc[t_i][column.name]

                imp_value = "Unkwown"
                col_text = column.create_sentence(value, imp_value, prefix, missing_word, descriptive=descriptive, replace_numbers=True)

                if len(col_text) >0:
                    col_text += ", "

                text_i += col_text

            text_i = text_i[:-2]+ ". " + sep
            text.append(text_i)

        self.text["text"] =  text

    def create_embeddings(self):
        """
        Creates a timestamped dataframe; each row contains NLP embeddings for the paragraph of the corresponding timestamp.
        """
        embeddings = []

        for i in range(self.text.shape[0]):

            text = self.text.iloc[i]["text"]

            full_embedding = get_biobert_embeddings(text)[0]

            embeddings.append(full_embedding.reshape(-1))

        emb_df =  pd.DataFrame(np.array(embeddings))
        emb_df = emb_df.set_index(self.text.index)

        merged_df = pd.concat([self.text, emb_df], axis=1)
        merged_df = merged_df.rename({i: self.name + "_" + str(i) for i in range(len(embeddings[0]))}, axis='columns')

        self.embeddings = merged_df.drop(["text"], axis = 1)

In [7]:
train_df = pd.read_csv("data/census_income/adult_data", sep=', ')
test_df = pd.read_csv("data/census_income/adult.test", sep=', ')

dataset = pd.concat([train_df, test_df])

#NaN are flagged as "?"
dataset['workclass'] = dataset['workclass'].replace('?', np.nan)
dataset['occupation'] = dataset['occupation'].replace('?', np.nan)
dataset['native.country'] = dataset['native.country'].replace('?', np.nan)

  train_df = pd.read_csv("data/census_income/adult_data", sep=', ')
  test_df = pd.read_csv("data/census_income/adult.test", sep=', ')


**Test: Categorical Column**

In [8]:
workclassColumn = Categorical_Column(name='workclass', attribute='Work Class', verb='is')

In [9]:
workclassColumn.create_descriptive_sentence(value=np.NaN, prefix="The", missing_word="imp_replace", imp_value="Unknown", replace_numbers=False)

"Th's Work Class is Unknown"

In [10]:
workclassColumn.create_basic_sentence(value=np.NaN, missing_word="imp_replace", imp_value="Unknown", replace_numbers=False)

'Work Class: Unknown'

**Test: Numerical Column**

In [11]:
ageColumn = Numerical_Column(name='age', attribute='Age', verb='is', avg=dataset['age'].mean(), sd=dataset['age'].std())

In [12]:
ageColumn.create_basic_sentence(value=dataset['age'].iloc[0], prefix="", missing_word="imp_replace", imp_value=dataset['age'].mean(), replace_numbers=False)

'Age: 39.0'

In [13]:
ageColumn.create_descriptive_sentence(value=dataset['age'].iloc[0], prefix="The", missing_word="imp_replace", imp_value=dataset['age'].mean(), replace_numbers=False)

"Th's Age is 39.0"

**Test: Binary Column**

In [14]:
dataset['sex_binary'] = np.where(dataset['sex']=='Male', 1, 0)

In [15]:
sexColumn = Binary_Column(name='sex_binary', attribute='Male', verb='is', neg_verb='is not')

In [16]:
sexColumn.create_basic_sentence(value=1, imp_value="", prefix="The gender", missing_word="imp_replace", replace_numbers=False)

'is Male: yes'

In [17]:
sexColumn.create_descriptive_sentence(value=1, imp_value="", prefix="The gender", missing_word="imp_replace", replace_numbers=False)

'The gender is Male'

**Test on whole table**

In [18]:
index = 0

row = dataset.iloc[index]

ageColumn = Numerical_Column(name='age', attribute='Age', verb='is', avg=dataset['age'].mean(), sd=dataset['age'].std())
workclassColumn = Categorical_Column(name='workclass', attribute='Workclass', verb='is')
educationColumn = Categorical_Column(name='education', attribute='Education', verb='is')
education_numColumn = Numerical_Column(name='education.num', attribute='Education Number', verb='is', avg=dataset['education.num'].mean(), sd=dataset['education.num'].std())
marital_statusColumn = Categorical_Column(name='marital.status', attribute='Marital Status', verb='is')
occupationColumn = Categorical_Column(name='occupation', attribute='Occupation', verb='is')
relationshipColumn = Categorical_Column(name='relationship', attribute='Relationship', verb='is')
raceColumn = Categorical_Column(name='race', attribute="Race", verb='is')
sexColumn = Categorical_Column(name='sex', attribute="Gender", verb='is')
capital_gainColumn = Numerical_Column(name='capital.gain', attribute="Capital Gain", verb='is', avg=dataset['capital.gain'].mean(), sd=dataset['capital.gain'].std())
capital_lossColumn = Numerical_Column(name='capital.loss', attribute="Capital Loss", verb='is', avg=dataset['capital.loss'].mean(), sd=dataset['capital.loss'].std())
hours_per_weekColumn = Numerical_Column(name='hours.per.week', attribute="Hours per Week", verb='is', avg=dataset['hours.per.week'].mean(), sd=dataset['hours.per.week'].std())
native_countryColumn = Categorical_Column(name='native.country', attribute="Native Country", verb='is')

censusTable = Table(name="census", 
                    df = dataset, 
                    columns=[ageColumn, workclassColumn, educationColumn, education_numColumn, 
                                                          marital_statusColumn, occupationColumn, relationshipColumn, raceColumn, 
                                                          sexColumn, capital_gainColumn, capital_lossColumn, hours_per_weekColumn, native_countryColumn], 
                    metadata="Census Income Dataset: ", 
                    imputer=None)

In [19]:
censusTable.create_text(prefix="The person ", missing_word="is missing", descriptive=True, meta=True, sep="</s>")
censusTable.text

Unnamed: 0,text
0,Census Income Dataset: The person's Age is nor...
1,Census Income Dataset: The person's Age is nor...
2,Census Income Dataset: The person's Age is nor...
3,Census Income Dataset: The person's Age is hig...
4,Census Income Dataset: The person's Age is nor...
...,...
48837,Census Income Dataset: The person's Age is nor...
48838,Census Income Dataset: The person's Age is hig...
48839,Census Income Dataset: The person's Age is nor...
48840,Census Income Dataset: The person's Age is nor...


In [23]:
censusTable.text.to_csv('data/census_income/census_income_sentences.csv', index=False)

In [22]:
first_string = censusTable.text.iloc[0]["text"]
first_string

"Census Income Dataset: The person's Age is normal, The person's Workclass is State-gov, The person's Education is Bachelors, The person's Education Number is high, The person's Marital Status is Never-married, The person's Occupation is Adm-clerical, The person's Relationship is Not-in-family, The person's Race is White, The person's Gender is Male, The person's Capital Gain is normal, The person's Capital Loss is normal, The person's Hours per Week is normal, The person's Native Country is United-States. </s>"

## **Model With Embeddings**

In [29]:
embeddings = pd.read_csv('data/embeddings.csv')

features_df = dataset.reset_index().drop(columns=["index"])

dataset = features_df.copy()

**Feature Engineering (same as in baseline)**

In [30]:
# Identify Numeric features
numeric_features = ['age','fnlwgt','education.num','capital.gain','capital.loss','hours.per.week','income']
cat_features = ['workclass','education','marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']

# Sex
dataset["sex"] = dataset["sex"].map({"Male": 0, "Female":1})

# Marital Status
dataset["marital.status"] = dataset["marital.status"].replace(['Never-married','Divorced','Separated','Widowed'], 'Single')
dataset["marital.status"] = dataset["marital.status"].replace(['Married-civ-spouse','Married-spouse-absent','Married-AF-spouse'], 'Married')
dataset["marital.status"] = dataset["marital.status"].map({"Married":1, "Single":0})
dataset["marital.status"] = dataset["marital.status"].astype(int)

# Education
dummies_ed = pd.get_dummies(dataset['education'], prefix='education')
dataset = pd.concat([dataset, dummies_ed], axis=1)
dataset = dataset.drop('education', axis=1)

# Workclass
dataset['workclass'] = dataset['workclass'].str.replace('?', 'Unemployed')   # Missing values
dummies_w = pd.get_dummies(dataset['workclass'], prefix='workclass')
dataset = pd.concat([dataset, dummies_w], axis=1)
dataset = dataset.drop('workclass', axis=1)

# Occupation
dataset['occupation'] = dataset['occupation'].str.replace('?', 'Unemployed')   # Missing values
dummies_o = pd.get_dummies(dataset['occupation'], prefix='occupation')
dataset = pd.concat([dataset, dummies_o], axis=1)
dataset = dataset.drop('occupation', axis=1)

# Race 
dummies_r = pd.get_dummies(dataset['race'], prefix='race')
dataset = pd.concat([dataset, dummies_r], axis=1)
dataset = dataset.drop('race', axis=1)

# Relationship
dummies_re = pd.get_dummies(dataset['relationship'], prefix='relationship')
dataset = pd.concat([dataset, dummies_re], axis=1)
dataset = dataset.drop('relationship', axis=1)

# Native Country and fnlwgt dropped
dataset.drop(labels=["native.country", "fnlwgt"], axis = 1, inplace = True)

# Convert to bool
for col in dataset.columns:
    if dataset[col].dtype == 'bool':
        dataset[col] = dataset[col].astype(int)

In [37]:
dataset['income']=dataset['income'].map({'<=50K': 0, '>50K': 1, '<=50K.': 0, '>50K.': 1}).astype(int)

**Without PCA**

In [38]:
full_df = pd.concat([dataset, embeddings], axis=1)

In [39]:
full_df

Unnamed: 0,age,education.num,marital.status,sex,capital.gain,capital.loss,hours.per.week,income,sex_binary,education_10th,...,374,375,376,377,378,379,380,381,382,383
0,39,13,0,0,2174,0,40,0,1,0,...,0.001148,-0.017442,0.092456,0.095219,-0.013387,-0.073221,0.104762,0.014535,-0.044319,-0.010846
1,50,13,1,0,0,0,13,0,1,0,...,-0.001280,-0.027532,0.084070,0.084152,-0.028333,-0.059644,0.091943,-0.007732,-0.037505,0.001786
2,38,9,0,0,0,0,40,0,1,0,...,0.015330,-0.007651,0.108652,0.079497,-0.003023,-0.079991,0.116133,0.019735,-0.052635,-0.002933
3,53,7,1,0,0,0,40,0,1,0,...,0.005265,-0.012854,0.091463,0.088137,-0.013124,-0.064002,0.116273,0.003250,-0.054972,0.005653
4,28,13,1,1,0,0,40,0,0,0,...,0.016198,-0.006476,0.122350,0.102755,0.000571,-0.081216,0.088777,0.001682,-0.048179,-0.010504
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,13,0,1,0,0,36,0,0,0,...,0.006846,-0.003409,0.089131,0.076167,-0.023011,-0.074823,0.100038,-0.001175,-0.049833,-0.001336
48838,64,9,0,0,0,0,40,0,1,0,...,0.007624,-0.009914,0.084471,0.066098,-0.018355,-0.085555,0.133181,-0.014142,-0.067975,0.013283
48839,38,13,1,0,0,0,50,0,1,0,...,0.014165,-0.006431,0.098493,0.085268,-0.014512,-0.074639,0.096582,0.011756,-0.050140,-0.006379
48840,44,13,0,0,5455,0,40,0,1,0,...,0.019287,-0.014969,0.098039,0.099007,-0.002364,-0.056246,0.102553,0.015269,-0.054333,-0.004718


**Models with no CV**

In [43]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, r2_score
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
import pickle

In [42]:
X_train, Y_train = full_df.iloc[:32561].drop('income', axis=1), full_df.iloc[:32561]['income']
X_test, Y_test = full_df.iloc[32561:].drop('income', axis=1), full_df.iloc[32561:]['income']

models = []
models.append(('LR', LogisticRegression()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('RF', RandomForestClassifier(n_estimators=100, max_features=3)))
models.append(('XGB', XGBClassifier()))

names = []
accuracies = []
aucs = []
precisions = []
recalls = []

for name, model in models:

    model.fit(X_train, Y_train)
    
    y_pred = model.predict(X_test)
    predictions = [round(value) for value in y_pred]

    accuracy = accuracy_score(Y_test, predictions)
    auc = roc_auc_score(Y_test, predictions)
    precision = precision_score(Y_test, predictions)
    recall = recall_score(Y_test, predictions)

    names.append(name)
    accuracies.append(accuracy)
    aucs.append(auc)
    precisions.append(precision_score(Y_test, predictions))
    recalls.append(recall_score(Y_test, predictions))

    msg = "%s: accuracy %f - AUC %f" % (name, accuracy, auc)
    print(msg)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LR: accuracy 0.795467 - AUC 0.612250
KNN: accuracy 0.806154 - AUC 0.602095
CART: accuracy 0.783244 - AUC 0.705718
NB: accuracy 0.831951 - AUC 0.768404
RF: accuracy 0.815552 - AUC 0.710705
XGB: accuracy 0.855599 - AUC 0.758203


**Models with CV**

In [46]:
models = [
    #("LR", LogisticRegression()),
    #("KNN", KNeighborsClassifier()),
    #("CART", DecisionTreeClassifier()),
    #("NB", GaussianNB()),
    #("RF", RandomForestClassifier()),
    ("XGB", XGBClassifier()),
]

param_grids = {

    "LogisticRegression": {
        "C": [0.01, 0.1, 1, 10, 100],
        "penalty": ["l1", "l2"],
        "solver": ["liblinear", "saga"],
    },

    "KNeighborsClassifier": {
        "n_neighbors": [3, 5, 7, 9],
        "weights": ["uniform", "distance"],
        "metric": ["euclidean", "manhattan"],
    },

    "DecisionTreeClassifier": {
        "max_depth": [3, 5, 10, None],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
    },

    "GaussianNB": {"var_smoothing": [1e-9, 1e-8, 1e-7]},

    "RandomForestClassifier": {
        "n_estimators": [100, 200, 300],
        "max_features": [2, 3, 4],
        "max_depth": [None, 3, 5, 10],
    },

    "XGBClassifier": {
        "n_estimators": [100, 200, 300],
        "learning_rate": [0.01, 0.1, 0.2],
        "max_depth": [3, 5, 7],
    },

}

In [47]:
#cell runs in 2min15
skf = StratifiedKFold(n_splits=5)
searches = {}
SCORING_METRIC = "roc_auc"
for name, model in models:
    print(f"Running RandomizedSearchCV for {model.__class__.__name__}")
    search = RandomizedSearchCV(
        model,
        param_grids[model.__class__.__name__],
        cv=skf.split(X_train, Y_train),
        scoring=SCORING_METRIC,
        n_jobs=-1,
        )
    search.fit(X_train, Y_train)
    searches[name] = search

Running RandomizedSearchCV for XGBClassifier


In [48]:
def find_best_threshold(y_true, y_prob):
    best_threshold = 0.5
    best_score = 0
    for threshold in np.arange(0.1, 0.9, 0.001):
        score = f1_score(y_true, y_prob >= threshold)
        if score > best_score:
            best_score = score
            best_threshold = threshold
    return best_threshold

In [49]:
# cell runs in 1min50
names = []
accuracies = []
aucs = []
precisions = []
recalls = []
best_thresholds = []
scores =[]
r2s = []
for name, search in searches.items():
    model = search.best_estimator_
    probas = model.predict_proba(X_train.values)[:, 1]
    best_threshold = find_best_threshold(Y_train, probas) # for f1

    # metrics on test set
    preds = (model.predict_proba(X_test.values)[:, 1] >= best_threshold).astype(int)

    auc = roc_auc_score(Y_test, preds)
    accuracy = accuracy_score(Y_test, preds)
    precision = precision_score(Y_test, preds)
    recall = recall_score(Y_test, preds)
    r2 = r2_score(Y_test, preds)

    names.append(name)
    aucs.append(auc)
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    r2s.append(r2)
    scores.append(search.best_score_)
    best_thresholds.append(best_threshold)

    msg = f"{name}: Best Threshold {best_threshold:.2f} - Accuracy {accuracy:.4f} - AUC {auc:.4f}"
    print(msg)

XGB: Best Threshold 0.40 - Accuracy 0.8625 - AUC 0.8115


In [50]:
metrics = pd.DataFrame(
    {
        #"Best threshold": best_thresholds,
        "Accuracy": accuracies,
        "AUC": aucs,
        "Precision": precisions,
        "Recall": recalls,
        "R2": r2s,
        "Training score": scores,
    },
    index=names,
).transpose()
metrics["Best Value"] = metrics.max(axis=1)
metrics["Best Model"] = metrics.idxmax(axis=1)
metrics

Unnamed: 0,XGB,Best Value,Best Model
Accuracy,0.862478,0.862478,XGB
AUC,0.811466,0.811466,XGB
Precision,0.706502,0.706502,XGB
Recall,0.714769,0.714769,XGB
R2,0.23778,0.23778,XGB
Training score,0.921979,0.921979,XGB


**With PCA**

In [51]:
from sklearn.decomposition import PCA

In [52]:
pca = PCA(n_components=3)
embeddings_3d = pca.fit_transform(embeddings)

In [53]:
embeddings_df = pd.DataFrame(embeddings_3d, columns=['x', 'y', 'z'])
full_df_pca = pd.concat([embeddings_df, dataset], axis=1)

In [62]:
from sklearn.preprocessing import StandardScaler

In [64]:
X_train, Y_train = full_df_pca.iloc[:32561].drop('income', axis=1), full_df_pca.iloc[:32561]['income']
X_test, Y_test = full_df_pca.iloc[32561:].drop('income', axis=1), full_df_pca.iloc[32561:]['income']

numeric_features = ['age','education.num','capital.gain','capital.loss','hours.per.week', 'x', 'y', 'z']
scaler = StandardScaler()
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])

models = []
models.append(('LR', LogisticRegression()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('RF', RandomForestClassifier(n_estimators=100, max_features=3)))
models.append(('XGB', XGBClassifier()))

names = []
accuracies = []
aucs = []
precisions = []
recalls = []

for name, model in models:

    model.fit(X_train, Y_train)
    
    y_pred = model.predict(X_test)
    predictions = [round(value) for value in y_pred]

    accuracy = accuracy_score(Y_test, predictions)
    auc = roc_auc_score(Y_test, predictions)
    precision = precision_score(Y_test, predictions)
    recall = recall_score(Y_test, predictions)

    names.append(name)
    accuracies.append(accuracy)
    aucs.append(auc)
    precisions.append(precision_score(Y_test, predictions))
    recalls.append(recall_score(Y_test, predictions))

    msg = "%s: accuracy %f - AUC %f" % (name, accuracy, auc)
    print(msg)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LR: accuracy 0.855906 - AUC 0.771873
KNN: accuracy 0.827775 - AUC 0.740257
CART: accuracy 0.767643 - AUC 0.698738
NB: accuracy 0.580493 - AUC 0.702654
RF: accuracy 0.837172 - AUC 0.742907
XGB: accuracy 0.856274 - AUC 0.753078


**With CV**

In [65]:
models = [
    #("LR", LogisticRegression()),
    #("KNN", KNeighborsClassifier()),
    #("CART", DecisionTreeClassifier()),
    #("NB", GaussianNB()),
    #("RF", RandomForestClassifier()),
    ("XGB", XGBClassifier()),
]

param_grids = {

    "LogisticRegression": {
        "C": [0.01, 0.1, 1, 10, 100],
        "penalty": ["l1", "l2"],
        "solver": ["liblinear", "saga"],
    },

    "KNeighborsClassifier": {
        "n_neighbors": [3, 5, 7, 9],
        "weights": ["uniform", "distance"],
        "metric": ["euclidean", "manhattan"],
    },

    "DecisionTreeClassifier": {
        "max_depth": [3, 5, 10, None],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
    },

    "GaussianNB": {"var_smoothing": [1e-9, 1e-8, 1e-7]},

    "RandomForestClassifier": {
        "n_estimators": [100, 200, 300],
        "max_features": [2, 3, 4],
        "max_depth": [None, 3, 5, 10],
    },

    "XGBClassifier": {
        "n_estimators": [100, 200, 300],
        "learning_rate": [0.01, 0.1, 0.2],
        "max_depth": [3, 5, 7],
    },

}

In [66]:
#cell runs in 2min15
skf = StratifiedKFold(n_splits=5)
searches = {}
SCORING_METRIC = "roc_auc"
for name, model in models:
    print(f"Running RandomizedSearchCV for {model.__class__.__name__}")
    search = RandomizedSearchCV(
        model,
        param_grids[model.__class__.__name__],
        cv=skf.split(X_train, Y_train),
        scoring=SCORING_METRIC,
        n_jobs=-1,
        )
    search.fit(X_train, Y_train)
    searches[name] = search

Running RandomizedSearchCV for XGBClassifier


In [67]:
def find_best_threshold(y_true, y_prob):
    best_threshold = 0.5
    best_score = 0
    for threshold in np.arange(0.1, 0.9, 0.001):
        score = f1_score(y_true, y_prob >= threshold)
        if score > best_score:
            best_score = score
            best_threshold = threshold
    return best_threshold

In [68]:
# cell runs in 1min50
names = []
accuracies = []
aucs = []
precisions = []
recalls = []
best_thresholds = []
scores =[]
r2s = []
for name, search in searches.items():
    model = search.best_estimator_
    probas = model.predict_proba(X_train.values)[:, 1]
    best_threshold = find_best_threshold(Y_train, probas) # for f1

    # metrics on test set
    preds = (model.predict_proba(X_test.values)[:, 1] >= best_threshold).astype(int)

    auc = roc_auc_score(Y_test, preds)
    accuracy = accuracy_score(Y_test, preds)
    precision = precision_score(Y_test, preds)
    recall = recall_score(Y_test, preds)
    r2 = r2_score(Y_test, preds)

    names.append(name)
    aucs.append(auc)
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    r2s.append(r2)
    scores.append(search.best_score_)
    best_thresholds.append(best_threshold)

    msg = f"{name}: Best Threshold {best_threshold:.2f} - Accuracy {accuracy:.4f} - AUC {auc:.4f}"
    print(msg)

XGB: Best Threshold 0.37 - Accuracy 0.8640 - AUC 0.8219


In [69]:
metrics = pd.DataFrame(
    {
        #"Best threshold": best_thresholds,
        "Accuracy": accuracies,
        "AUC": aucs,
        "Precision": precisions,
        "Recall": recalls,
        "R2": r2s,
        "Training score": scores,
    },
    index=names,
).transpose()
metrics["Best Value"] = metrics.max(axis=1)
metrics["Best Model"] = metrics.idxmax(axis=1)
metrics

Unnamed: 0,XGB,Best Value,Best Model
Accuracy,0.864013,0.864013,XGB
AUC,0.821899,0.821899,XGB
Precision,0.700196,0.700196,XGB
Recall,0.74207,0.74207,XGB
R2,0.246291,0.246291,XGB
Training score,0.925893,0.925893,XGB
