In [23]:
import pandas as pd
import json
import sys
import os
import pandas as pd
import numpy as np

In [24]:
class Column(object):
    """
    Column module containing functionality to convert feature values into written text

    name: String corresponding to the name of the column in the data set.
    attribute: String corresponging to the text description of the column.
    col_type: The type of the column: binary, categorical or numerical.
    verb: The verb required to conjugate the attribute.
    encode_fn: The function used to encode categorical values of this column.

    """
    def __init__(self, name, attribute=None, col_type=None, verb=None, encode_fn=None):
        self.name = name
        self.attribute = attribute
        self.type = col_type
        self.verb = verb
        self.encode_fn = encode_fn

    def is_binary(self):
        return self.type == "binary"

    def is_categorical(self):
        return self.type == "categorical"

    def is_numerical(self):
        return self.type == "numerical"

    def create_sentence(self, value, imp_value, prefix, missing_word, replace_numbers, descriptive):
        """
        Parameters::
            value: The value of this column at a specific data point
            imp_value: The imputed value of this column at a specific data point.
            prefix: String containing the desired prefix to add at the beginning of the sentence ("", "the Patient", etc.)
            missing_word: String describing how to handle missing values (e.g. "", "is missing" "imp_replace")
            replace_numbers: Boolean indicating weather or not to replace numerical values with text (e.g. very low, high, normal)
            descriptive: Boolean indicating weather or not the sentence should be descriptive.

        Returns::
            String with a sentence describing the column and its value.
            In the case of missing values:
                1. If missing_word == "" the sentence is the empty string
                2. If missing_word == "imp_replace" the sentence is constructed using the imputed value
                3. For all other cases the sentence is constructed using the text in the string missing_word

        """
        if descriptive:
            return self.create_descriptive_sentence(value, imp_value, prefix, missing_word, replace_numbers)
        else:
            return self.create_basic_sentence(value, imp_value, prefix, missing_word, replace_numbers)

In [25]:
class Binary_Column(Column):
    """
    Binary Column submodule for columns with values in [1, 0, true, false, "1", "0", "true", "false"]

    verb: The positive from of the verb used to conjugate the attribute when value is 1, "1" or "True"
    neg_verb: Negative form of the verb used to conjugate the attribute when value is 0, "0" or "false"

    """
    def __init__(self, name, attribute, verb, neg_verb, encode_fn=None):
        self.neg_verb = neg_verb
        super().__init__(name, attribute, "binary", verb, encode_fn)


    def create_descriptive_sentence(self, value, imp_value, prefix, missing_word, replace_numbers):
        sentence = ""

        if str(value).lower()  in ["1", "0", "true", "false"]:

            if int(value) == 1:

                sentence = prefix + " " + self.verb + " " + self.attribute

            elif int(value) == 0:

                sentence = prefix + " " + self.neg_verb + " " + self.attribute

        return sentence


    def create_basic_sentence(self, value, imp_value, prefix, missing_word, replace_numbers):

        sentence = ""

        if str(value).lower()  in ["1", "0", "true", "false"]:

            if int(value) == 1:

                sentence = self.verb + " " + self.attribute + ": yes"

            elif int(value) == 0:

                sentence = self.neg_verb + " " + self.attribute +" : no"

        elif missing_word != "":
            sentence = self.verb + " " + self.attribute + ": " + missing_word
        return sentence

In [26]:
class Categorical_Column(Column):
    """
    Categorical Column submodule for columns with non-numerical values

    """
    def __init__(self, name, attribute, verb, encode_fn=None):
        super().__init__(name, attribute, "categorical", verb, encode_fn)

    def create_descriptive_sentence(self, value, imp_value, prefix, missing_word, replace_numbers):
        if len(prefix) != 0:
            prefix = prefix[:-1] + "'s "
        sentence = ""
        if str(value).lower() not in ["nan", "", "none", "missing"]:
            sentence = prefix + self.attribute + " " + self.verb + " " + str(value)
        elif missing_word not in ["", "imp_replace"]:
            sentence = prefix  + self.attribute + " " + self.verb + " " + missing_word
        elif missing_word == "imp_replace":
            sentence = prefix + self.attribute + " " + self.verb + " " + str(imp_value)
        return sentence


    def create_basic_sentence(self, value, imp_value, missing_word, replace_numbers):
        sentence = ""
        if  str(value).lower() not in ["nan", "", "none", "missing"]:
            sentence = self.attribute + ": " + str(value)
        elif missing_word not in ["", "imp_replace"]:
            sentence = self.attribute + ": " + missing_word
        elif missing_word == "imp_replace":
            sentence = self.attribute + ": " + str(imp_value)
        return sentence

In [27]:
class Numerical_Column(Column):
    """
    Numerical Column submodule for columns with numerical values

    avg: The average of the values observed for this column (to be computed usign Training set)
    sd: The standard deviation of the values observed for this column (to be computed usign Training set)

    """
    def __init__(self, name, attribute, verb, avg, sd, encode_fn = None):
        self.avg = avg
        self.sd = sd
        super().__init__(name, attribute, "numerical", verb, encode_fn)


    def create_descriptive_sentence(self, value, imp_value, prefix, missing_word, replace_numbers):
        if len(prefix) != 0:
            prefix = prefix[:-1] + "'s "
        sentence = ""
        if str(value).lower() not in ["nan", "", "none", "missing"]:
            value = float(value)
            col_value = self.encode_number(value, replace_numbers)
            sentence = prefix  + self.attribute + " " + self.verb + " " + str(col_value)
        elif  missing_word not in ["", "imp_replace"]:
            sentence = prefix  + self.attribute + " " + self.verb + " " + missing_word
        elif missing_word == "imp_replace":
            col_value = self.encode_number(imp_value, replace_numbers)
            sentence = prefix  + self.attribute + " " + self.verb + " " + str(col_value)
        return sentence


    def create_basic_sentence(self, value, imp_value, prefix, missing_word, replace_numbers):
        sentence = ""
        if  str(value).lower() not in ["nan", "", "none", "missing"]:
            value = float(value)
            col_value = self.encode_number(value, replace_numbers)
            sentence = self.attribute + ": " + str(col_value)
        elif missing_word not in ["", "imp_replace"]:
            sentence = self.attribute + ": " + missing_word
        elif missing_word == "imp_replace":
            col_value = self.encode_number(imp_value, replace_numbers)
            sentence = self.attribute + ": " + str(col_value)
        return sentence

    def encode_number(self, value, replace_numbers):
        new_value = value
        if replace_numbers:
            if self.avg - 2*self.sd > value:
                new_value = "very low"
            elif self.avg - 2*self.sd <= value < self.avg - self.sd:
                new_value = "low"
            elif self.avg + 2*self.sd >= value > self.avg + self.sd:
                new_value = "high"
            elif self.avg + 2*self.sd < value:
                new_value = "very high"
            else:
                new_value = "normal"
        return new_value

In [28]:
class Table(object):
    """
    Table module containing tabular information for a specific id.

    name: String corresponding to the name of the table.
    df: Dataframe containing the tabular data for a specific id.
    columns: List of Column objects corresponing to the columns in df.
    metadata: String containing metadata information about this table structure.
    time_col: Name of the column in df containing the timestamp for each observation.
    imputer: Function used to impute the missing values in df.

    """

    def __init__(self, name="", df=pd.DataFrame(), columns=None, metadata=None, imputer=None):

        self.name = name
        self.columns = columns
        self.metadata = metadata
        self.df = df
        self.is_empty = pd.isna(df).all().all()

    def create_encoded_imputed_vectors(self):
        """
        Creates encoded and imputed versions of the table contents.
        """
        encoded_df =  pd.DataFrame()

        for column in self.columns:
            col_values = self.df[column.name]
            col_encoder = column.encode_fn
            labels = col_encoder(col_values[col_values.notnull()])
            encoded_df[column.name] = col_values
            encoded_df[column.name] = pd.Series(labels, index=col_values[col_values.notnull()].index)

        self.encodings = encoded_df


    def create_text(self, prefix, missing_word, descriptive, meta, sep = "</s>"):
        """
        Creates a timestamped dataframe; each row contains a String (paragraph) with all the tabular information for the
        corresponding timestamp.

        Paramteres::
            prefix: String containing the desired prefix to add at the beginning of each sentence ("", "the Patient", etc.)
            missing_word: String describing how to handle missing values (e.g. "", "is missing" "imp_replace")
            replace_numbers: Boolean indicating weather or not to replace numerical values with text (e.g. very low, high, normal)
            descriptive: Boolean indicating weather or not each sentence should be descriptive.
            meta: Boolean indicating weather or not to include meta information in the paragraphs.
            sep: String indicating what symbol to use at the end of the paragraph as a separator between tables.
        """
        self.text = pd.DataFrame()
        text = []

        for t_i in range(self.df.shape[0]):

            text_i = ""

            if meta & (len(str(self.metadata)) >1):
                text_i = self.metadata

            for column in self.columns:

                value = self.df.iloc[t_i][column.name]

                imp_value = "Unkwown"
                col_text = column.create_sentence(value, imp_value, prefix, missing_word, descriptive=descriptive, replace_numbers=True)

                if len(col_text) >0:
                    col_text += ", "

                text_i += col_text

            text_i = text_i[:-2]+ ". " + sep
            text.append(text_i)

        self.text["text"] =  text

    def create_embeddings(self):
        """
        Creates a timestamped dataframe; each row contains NLP embeddings for the paragraph of the corresponding timestamp.
        """
        embeddings = []

        for i in range(self.text.shape[0]):

            text = self.text.iloc[i]["text"]

            full_embedding = get_biobert_embeddings(text)[0]

            embeddings.append(full_embedding.reshape(-1))

        emb_df =  pd.DataFrame(np.array(embeddings))
        emb_df = emb_df.set_index(self.text.index)

        merged_df = pd.concat([self.text, emb_df], axis=1)
        merged_df = merged_df.rename({i: self.name + "_" + str(i) for i in range(len(embeddings[0]))}, axis='columns')

        self.embeddings = merged_df.drop(["text"], axis = 1)

In [29]:
train_df = pd.read_csv("data/census_income/adult_data", sep=', ')
test_df = pd.read_csv("data/census_income/adult.test", sep=', ')

dataset = pd.concat([train_df, test_df])

#NaN are flagged as "?"
dataset['workclass'] = dataset['workclass'].replace('?', np.nan)
dataset['occupation'] = dataset['occupation'].replace('?', np.nan)
dataset['native.country'] = dataset['native.country'].replace('?', np.nan)

  train_df = pd.read_csv("data/census_income/adult_data", sep=', ')
  test_df = pd.read_csv("data/census_income/adult.test", sep=', ')


**Test: Categorical Column**

In [89]:
workclassColumn = Categorical_Column(name='workclass', attribute='Work Class', verb='is')

In [90]:
workclassColumn.create_descriptive_sentence(value=np.NaN, prefix="The", missing_word="imp_replace", imp_value="Unknown", replace_numbers=False)

"Th's Work Class is Unknown"

In [91]:
workclassColumn.create_basic_sentence(value=np.NaN, missing_word="imp_replace", imp_value="Unknown", replace_numbers=False)

'Work Class: Unknown'

**Test: Numerical Column**

In [11]:
ageColumn = Numerical_Column(name='age', attribute='Age', verb='is', avg=dataset['age'].mean(), sd=dataset['age'].std())

In [12]:
ageColumn.create_basic_sentence(value=dataset['age'].iloc[0], prefix="", missing_word="imp_replace", imp_value=dataset['age'].mean(), replace_numbers=False)

'Age: 39.0'

In [13]:
ageColumn.create_descriptive_sentence(value=dataset['age'].iloc[0], prefix="The", missing_word="imp_replace", imp_value=dataset['age'].mean(), replace_numbers=False)

"Th's Age is 39.0"

**Test: Binary Column**

In [14]:
dataset['sex_binary'] = np.where(dataset['sex']=='Male', 1, 0)

In [15]:
sexColumn = Binary_Column(name='sex_binary', attribute='Male', verb='is', neg_verb='is not')

In [16]:
sexColumn.create_basic_sentence(value=1, imp_value="", prefix="The gender", missing_word="imp_replace", replace_numbers=False)

'is Male: yes'

In [17]:
sexColumn.create_descriptive_sentence(value=1, imp_value="", prefix="The gender", missing_word="imp_replace", replace_numbers=False)

'The gender is Male'

**Test on whole table**

In [18]:
index = 0

row = dataset.iloc[index]

ageColumn = Numerical_Column(name='age', attribute='Age', verb='is', avg=dataset['age'].mean(), sd=dataset['age'].std())
workclassColumn = Categorical_Column(name='workclass', attribute='Workclass', verb='is')
educationColumn = Categorical_Column(name='education', attribute='Education', verb='is')
education_numColumn = Numerical_Column(name='education.num', attribute='Education Number', verb='is', avg=dataset['education.num'].mean(), sd=dataset['education.num'].std())
marital_statusColumn = Categorical_Column(name='marital.status', attribute='Marital Status', verb='is')
occupationColumn = Categorical_Column(name='occupation', attribute='Occupation', verb='is')
relationshipColumn = Categorical_Column(name='relationship', attribute='Relationship', verb='is')
raceColumn = Categorical_Column(name='race', attribute="Race", verb='is')
sexColumn = Categorical_Column(name='sex', attribute="Gender", verb='is')
capital_gainColumn = Numerical_Column(name='capital.gain', attribute="Capital Gain", verb='is', avg=dataset['capital.gain'].mean(), sd=dataset['capital.gain'].std())
capital_lossColumn = Numerical_Column(name='capital.loss', attribute="Capital Loss", verb='is', avg=dataset['capital.loss'].mean(), sd=dataset['capital.loss'].std())
hours_per_weekColumn = Numerical_Column(name='hours.per.week', attribute="Hours per Week", verb='is', avg=dataset['hours.per.week'].mean(), sd=dataset['hours.per.week'].std())
native_countryColumn = Categorical_Column(name='native.country', attribute="Native Country", verb='is')

censusTable = Table(name="census", 
                    df = dataset, 
                    columns=[ageColumn, workclassColumn, educationColumn, education_numColumn, 
                                                          marital_statusColumn, occupationColumn, relationshipColumn, raceColumn, 
                                                          sexColumn, capital_gainColumn, capital_lossColumn, hours_per_weekColumn, native_countryColumn], 
                    metadata="Census Income Dataset: ", 
                    imputer=None)

In [19]:
censusTable.create_text(prefix="The person ", missing_word="is missing", descriptive=True, meta=True, sep="</s>")
censusTable.text

Unnamed: 0,text
0,Census Income Dataset: The person's Age is nor...
1,Census Income Dataset: The person's Age is nor...
2,Census Income Dataset: The person's Age is nor...
3,Census Income Dataset: The person's Age is hig...
4,Census Income Dataset: The person's Age is nor...
...,...
48837,Census Income Dataset: The person's Age is nor...
48838,Census Income Dataset: The person's Age is hig...
48839,Census Income Dataset: The person's Age is nor...
48840,Census Income Dataset: The person's Age is nor...


In [23]:
censusTable.text.to_csv('data/census_income/census_income_sentences.csv', index=False)

In [22]:
first_string = censusTable.text.iloc[0]["text"]
first_string

"Census Income Dataset: The person's Age is normal, The person's Workclass is State-gov, The person's Education is Bachelors, The person's Education Number is high, The person's Marital Status is Never-married, The person's Occupation is Adm-clerical, The person's Relationship is Not-in-family, The person's Race is White, The person's Gender is Male, The person's Capital Gain is normal, The person's Capital Loss is normal, The person's Hours per Week is normal, The person's Native Country is United-States. </s>"

**New Sentence generation**

In [30]:
# Maps for column values to more understandable terms
workclass_map = {
    'Private': 'private sector',
    'Self-emp-not-inc': 'self-employed without incorporation',
    'Local-gov': 'local government',
    'State-gov': 'state government',
    'Self-emp-inc': 'self-employed with incorporation',
    'Federal-gov': 'federal government',
    'Without-pay': 'without pay',
    'Never-worked': 'never worked'
}

occupation_map = {
    'Adm-clerical': 'administrative clerical',
    'Exec-managerial': 'executive managerial',
    'Handlers-cleaners': 'handlers and cleaners',
    'Prof-specialty': 'professional specialty',
    'Other-service': 'other services',
    'Sales': 'sales',
    'Craft-repair': 'craft repair',
    'Transport-moving': 'transport moving',
    'Farming-fishing': 'farming and fishing',
    'Machine-op-inspct': 'machine operator inspector',
    'Tech-support': 'technical support',
    'Protective-serv': 'protective service',
    'Armed-Forces': 'armed forces',
    'Priv-house-serv': 'private house service'
}

education_map = {
    'Bachelors': 'Bachelor’s degree',
    'Some-college': 'some college education',
    '11th': '11th grade',
    'HS-grad': 'high school graduate',
    'Prof-school': 'professional school',
    'Assoc-acdm': 'associate’s degree (academic program)',
    'Assoc-voc': 'associate’s degree (vocational program)',
    '9th': '9th grade',
    '7th-8th': '7th to 8th grade',
    '12th': '12th grade',
    'Masters': 'Master’s degree',
    '1st-4th': '1st to 4th grade',
    '10th': '10th grade',
    'Doctorate': 'Doctorate degree',
    '5th-6th': '5th to 6th grade',
    'Preschool': 'preschool'
}

marital_status_map = {
    'Married-civ-spouse': 'married to a civilian spouse',
    'Divorced': 'divorced',
    'Never-married': 'not married',
    'Separated': 'separated',
    'Widowed': 'widowed',
    'Married-spouse-absent': 'married with spouse absent',
    'Married-AF-spouse': 'married to a spouse in the Armed Forces'
}

In [31]:
dataset['relationship'].unique()

array(['Not-in-family', 'Husband', 'Wife', 'Own-child', 'Unmarried',
       'Other-relative'], dtype=object)

In [32]:
def generate_sentence(row):
    row = row.fillna('not specified')
    
    pronoun = 'he' if row['sex'] == 'Male' else 'she'
    pronoun_two = 'his' if row['sex'] == 'Male' else 'her'
    
    # Apply mappings to the row values
    workclass_desc = workclass_map.get(row['workclass'], 'not specified')
    occupation_desc = occupation_map.get(row['occupation'], 'not specified').replace('-', ' ').title()
    education_desc = education_map.get(row['education'], 'not specified')
    marital_status_desc = marital_status_map.get(row['marital.status'], 'not specified')
    
    # Define the sentence format
    sentence = (
        f"The following is the information gathered about a person from the census income survey: "
        f"A {row['age']}-year-old {row['sex']}, born in {row['native.country']}, "
        f"is currently {marital_status_desc} and works as an {occupation_desc} "
        f"with a work class of type '{workclass_desc}'. {pronoun.capitalize()} holds a {education_desc}, "
        f"which corresponds to {row['education.num']} years of education. "
        f"{pronoun_two.capitalize()} relationship status within the family is '{row['relationship'].lower()}', "
        f"and {pronoun_two} race is {row['race']}. {pronoun.capitalize()} has had a capital gain of ${row['capital.gain']} "
        f"and a capital loss of ${row['capital.loss']}. {pronoun.capitalize()} typically works {row['hours.per.week']} hours per week."
    )
    
    return sentence


In [33]:
dataset['sentence'] = dataset.apply(generate_sentence, axis=1)

In [34]:
sentences = dataset['sentence']
sentences.iloc[0]

"The following is the information gathered about a person from the census income survey: A 39-year-old Male, born in United-States, is currently not married and works as an Administrative Clerical with a work class of type 'state government'. He holds a Bachelor’s degree, which corresponds to 13 years of education. His relationship status within the family is 'not-in-family', and his race is White. He has had a capital gain of $2174 and a capital loss of $0. He typically works 40 hours per week."

In [35]:
sentences = dataset['sentence']
sentences.iloc[4]

"The following is the information gathered about a person from the census income survey: A 28-year-old Female, born in Cuba, is currently married to a civilian spouse and works as an Professional Specialty with a work class of type 'private sector'. She holds a Bachelor’s degree, which corresponds to 13 years of education. Her relationship status within the family is 'wife', and her race is Black. She has had a capital gain of $0 and a capital loss of $0. She typically works 40 hours per week."

In [14]:
pd.DataFrame(sentences).reset_index().drop('index', axis=1) #.to_csv('data/census_income/census_income_sentences_two.csv', index=False)

Unnamed: 0,sentence
0,The following is the information gathered abou...
1,The following is the information gathered abou...
2,The following is the information gathered abou...
3,The following is the information gathered abou...
4,The following is the information gathered abou...
...,...
48837,The following is the information gathered abou...
48838,The following is the information gathered abou...
48839,The following is the information gathered abou...
48840,The following is the information gathered abou...


In [15]:
dataset

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income,sentence
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,The following is the information gathered abou...
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,The following is the information gathered abou...
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,The following is the information gathered abou...
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,The following is the information gathered abou...
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,The following is the information gathered abou...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16276,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K.,The following is the information gathered abou...
16277,64,,321403,HS-grad,9,Widowed,,Other-relative,Black,Male,0,0,40,United-States,<=50K.,The following is the information gathered abou...
16278,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K.,The following is the information gathered abou...
16279,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K.,The following is the information gathered abou...


## **Model With Embeddings**

In [36]:
embeddings = pd.read_csv('data/embeddings_two.csv', index_col=0)

features_df = dataset.reset_index().drop(columns=["index", "sentence"])

dataset = features_df.copy()

In [37]:
dataset

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K.
48838,64,,321403,HS-grad,9,Widowed,,Other-relative,Black,Male,0,0,40,United-States,<=50K.
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K.
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K.


In [19]:
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,0.032139,0.071867,-0.007886,0.052695,-0.075506,0.043765,0.004391,-0.029290,-0.113459,-0.014140,...,0.004788,-0.096420,0.059174,0.054538,-0.070650,-0.045039,0.044613,-0.119611,-0.013577,-0.027994
1,0.021484,0.069023,0.008012,0.061058,-0.069414,0.043304,0.003390,-0.044679,-0.092029,-0.004433,...,0.005815,-0.083965,0.070486,0.022959,-0.097008,-0.017320,0.023767,-0.086475,0.005117,-0.007993
2,0.021001,0.054209,0.003532,0.042072,-0.047621,0.049198,0.001531,-0.033915,-0.091951,-0.024021,...,-0.005852,-0.098320,0.063493,0.049092,-0.094019,-0.007333,0.075643,-0.096784,-0.017764,-0.007637
3,-0.014614,0.048819,-0.005284,0.068839,-0.061864,0.058125,0.008490,-0.067363,-0.112488,-0.012781,...,0.001265,-0.100361,0.051849,0.019471,-0.095333,-0.012139,0.057498,-0.096284,-0.045485,-0.002991
4,0.004204,0.041804,-0.008502,0.124579,-0.050307,0.075948,-0.042636,-0.029177,-0.067431,-0.006577,...,0.001610,-0.051525,0.120155,0.045476,-0.010707,-0.014043,-0.012373,-0.114517,-0.014849,-0.037741
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,0.021877,0.033387,0.003269,0.060765,-0.059235,0.068964,-0.030491,-0.007904,-0.078635,-0.016283,...,-0.027383,-0.052514,0.071902,0.007854,-0.084812,0.024659,0.044435,-0.100517,-0.002036,-0.024184
48838,0.021239,0.076057,-0.009165,0.043226,-0.043372,0.057898,-0.023605,-0.052264,-0.102115,-0.036013,...,-0.029643,-0.096578,0.042340,0.048303,-0.102852,-0.030189,0.091876,-0.091696,-0.023752,-0.018141
48839,0.033779,0.061194,-0.006652,0.065603,-0.063992,0.052079,0.004481,-0.038760,-0.099008,-0.011992,...,0.000178,-0.087267,0.066552,0.035193,-0.087489,-0.018290,0.035685,-0.097819,-0.025926,-0.019455
48840,0.043459,0.062229,0.000002,0.057893,-0.069635,0.036610,-0.006385,-0.040110,-0.066589,-0.027605,...,0.009774,-0.086415,0.067361,0.050658,-0.084436,-0.001457,0.046852,-0.110469,-0.020653,-0.016724


**Feature Engineering (same as in baseline)**

In [38]:
# Identify Numeric features
numeric_features = ['age','fnlwgt','education.num','capital.gain','capital.loss','hours.per.week','income']
cat_features = ['workclass','education','marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']

# Sex
dataset["sex"] = dataset["sex"].map({"Male": 0, "Female":1})

# Marital Status
dataset["marital.status"] = dataset["marital.status"].replace(['Never-married','Divorced','Separated','Widowed'], 'Single')
dataset["marital.status"] = dataset["marital.status"].replace(['Married-civ-spouse','Married-spouse-absent','Married-AF-spouse'], 'Married')
dataset["marital.status"] = dataset["marital.status"].map({"Married":1, "Single":0})
dataset["marital.status"] = dataset["marital.status"].astype(int)

# Education
dummies_ed = pd.get_dummies(dataset['education'], prefix='education')
dataset = pd.concat([dataset, dummies_ed], axis=1)
dataset = dataset.drop('education', axis=1)

# Workclass
dataset['workclass'] = dataset['workclass'].str.replace('?', 'Unemployed')   # Missing values
dummies_w = pd.get_dummies(dataset['workclass'], prefix='workclass')
dataset = pd.concat([dataset, dummies_w], axis=1)
dataset = dataset.drop('workclass', axis=1)

# Occupation
dataset['occupation'] = dataset['occupation'].str.replace('?', 'Unemployed')   # Missing values
dummies_o = pd.get_dummies(dataset['occupation'], prefix='occupation')
dataset = pd.concat([dataset, dummies_o], axis=1)
dataset = dataset.drop('occupation', axis=1)

# Race 
dummies_r = pd.get_dummies(dataset['race'], prefix='race')
dataset = pd.concat([dataset, dummies_r], axis=1)
dataset = dataset.drop('race', axis=1)

# Relationship
dummies_re = pd.get_dummies(dataset['relationship'], prefix='relationship')
dataset = pd.concat([dataset, dummies_re], axis=1)
dataset = dataset.drop('relationship', axis=1)

# Native Country and fnlwgt dropped
dataset.drop(labels=["native.country", "fnlwgt"], axis = 1, inplace = True)

# Convert to bool
for col in dataset.columns:
    if dataset[col].dtype == 'bool':
        dataset[col] = dataset[col].astype(int)

In [39]:
dataset['income']=dataset['income'].map({'<=50K': 0, '>50K': 1, '<=50K.': 0, '>50K.': 1}).astype(int)

**Without PCA**

In [53]:
full_df = pd.concat([dataset, embeddings], axis=1)

In [54]:
full_df

Unnamed: 0,age,education.num,marital.status,sex,capital.gain,capital.loss,hours.per.week,income,education_10th,education_11th,...,374,375,376,377,378,379,380,381,382,383
0,39,13,0,0,2174,0,40,0,0,0,...,0.004788,-0.096420,0.059174,0.054538,-0.070650,-0.045039,0.044613,-0.119611,-0.013577,-0.027994
1,50,13,1,0,0,0,13,0,0,0,...,0.005815,-0.083965,0.070486,0.022959,-0.097008,-0.017320,0.023767,-0.086475,0.005117,-0.007993
2,38,9,0,0,0,0,40,0,0,0,...,-0.005852,-0.098320,0.063493,0.049092,-0.094019,-0.007333,0.075643,-0.096784,-0.017764,-0.007637
3,53,7,1,0,0,0,40,0,0,1,...,0.001265,-0.100361,0.051849,0.019471,-0.095333,-0.012139,0.057498,-0.096284,-0.045485,-0.002991
4,28,13,1,1,0,0,40,0,0,0,...,0.001610,-0.051525,0.120155,0.045476,-0.010707,-0.014043,-0.012373,-0.114517,-0.014849,-0.037741
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,13,0,1,0,0,36,0,0,0,...,-0.027383,-0.052514,0.071902,0.007854,-0.084812,0.024659,0.044435,-0.100517,-0.002036,-0.024184
48838,64,9,0,0,0,0,40,0,0,0,...,-0.029643,-0.096578,0.042340,0.048303,-0.102852,-0.030189,0.091876,-0.091696,-0.023752,-0.018141
48839,38,13,1,0,0,0,50,0,0,0,...,0.000178,-0.087267,0.066552,0.035193,-0.087489,-0.018290,0.035685,-0.097819,-0.025926,-0.019455
48840,44,13,0,0,5455,0,40,0,0,0,...,0.009774,-0.086415,0.067361,0.050658,-0.084436,-0.001457,0.046852,-0.110469,-0.020653,-0.016724


**Models with no CV**

In [45]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, r2_score
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
import pickle

In [55]:
X_train, Y_train = full_df.iloc[:32561].drop('income', axis=1), full_df.iloc[:32561]['income']
X_test, Y_test = full_df.iloc[32561:].drop('income', axis=1), full_df.iloc[32561:]['income']

models = []
models.append(('LR', LogisticRegression()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('RF', RandomForestClassifier(n_estimators=100, max_features=3)))
models.append(('XGB', XGBClassifier()))

names = []
accuracies = []
aucs = []
precisions = []
recalls = []

for name, model in models:

    model.fit(X_train, Y_train)
    
    y_pred = model.predict(X_test)
    predictions = [round(value) for value in y_pred]

    accuracy = accuracy_score(Y_test, predictions)
    auc = roc_auc_score(Y_test, predictions)
    precision = precision_score(Y_test, predictions)
    recall = recall_score(Y_test, predictions)

    names.append(name)
    accuracies.append(accuracy)
    aucs.append(auc)
    precisions.append(precision_score(Y_test, predictions))
    recalls.append(recall_score(Y_test, predictions))

    msg = "%s: accuracy %f - AUC %f" % (name, accuracy, auc)
    print(msg)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LR: accuracy 0.836988 - AUC 0.711358
KNN: accuracy 0.847798 - AUC 0.776174
CART: accuracy 0.811559 - AUC 0.734132
NB: accuracy 0.826915 - AUC 0.792944
RF: accuracy 0.836742 - AUC 0.726283
XGB: accuracy 0.861249 - AUC 0.783184


**Models with CV**

In [46]:
models = [
    #("LR", LogisticRegression()),
    #("KNN", KNeighborsClassifier()),
    #("CART", DecisionTreeClassifier()),
    #("NB", GaussianNB()),
    #("RF", RandomForestClassifier()),
    ("XGB", XGBClassifier()),
]

param_grids = {

    "LogisticRegression": {
        "C": [0.01, 0.1, 1, 10, 100],
        "penalty": ["l1", "l2"],
        "solver": ["liblinear", "saga"],
    },

    "KNeighborsClassifier": {
        "n_neighbors": [3, 5, 7, 9],
        "weights": ["uniform", "distance"],
        "metric": ["euclidean", "manhattan"],
    },

    "DecisionTreeClassifier": {
        "max_depth": [3, 5, 10, None],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
    },

    "GaussianNB": {"var_smoothing": [1e-9, 1e-8, 1e-7]},

    "RandomForestClassifier": {
        "n_estimators": [100, 200, 300],
        "max_features": [2, 3, 4],
        "max_depth": [None, 3, 5, 10],
    },

    "XGBClassifier": {
        "n_estimators": [100, 200, 300],
        "learning_rate": [0.01, 0.1, 0.2],
        "max_depth": [3, 5, 7],
    },

}

In [47]:
#cell runs in 2min15
skf = StratifiedKFold(n_splits=5)
searches = {}
SCORING_METRIC = "roc_auc"
for name, model in models:
    print(f"Running RandomizedSearchCV for {model.__class__.__name__}")
    search = RandomizedSearchCV(
        model,
        param_grids[model.__class__.__name__],
        cv=skf.split(X_train, Y_train),
        scoring=SCORING_METRIC,
        n_jobs=-1,
        )
    search.fit(X_train, Y_train)
    searches[name] = search

Running RandomizedSearchCV for XGBClassifier


In [48]:
def find_best_threshold(y_true, y_prob):
    best_threshold = 0.5
    best_score = 0
    for threshold in np.arange(0.1, 0.9, 0.001):
        score = f1_score(y_true, y_prob >= threshold)
        if score > best_score:
            best_score = score
            best_threshold = threshold
    return best_threshold

In [49]:
# cell runs in 1min50
names = []
accuracies = []
aucs = []
precisions = []
recalls = []
best_thresholds = []
scores =[]
r2s = []
for name, search in searches.items():
    model = search.best_estimator_
    probas = model.predict_proba(X_train.values)[:, 1]
    best_threshold = find_best_threshold(Y_train, probas) # for f1

    # metrics on test set
    preds = (model.predict_proba(X_test.values)[:, 1] >= best_threshold).astype(int)

    auc = roc_auc_score(Y_test, preds)
    accuracy = accuracy_score(Y_test, preds)
    precision = precision_score(Y_test, preds)
    recall = recall_score(Y_test, preds)
    r2 = r2_score(Y_test, preds)

    names.append(name)
    aucs.append(auc)
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    r2s.append(r2)
    scores.append(search.best_score_)
    best_thresholds.append(best_threshold)

    msg = f"{name}: Best Threshold {best_threshold:.2f} - Accuracy {accuracy:.4f} - AUC {auc:.4f}"
    print(msg)

XGB: Best Threshold 0.40 - Accuracy 0.8625 - AUC 0.8115


In [50]:
metrics = pd.DataFrame(
    {
        #"Best threshold": best_thresholds,
        "Accuracy": accuracies,
        "AUC": aucs,
        "Precision": precisions,
        "Recall": recalls,
        "R2": r2s,
        "Training score": scores,
    },
    index=names,
).transpose()
metrics["Best Value"] = metrics.max(axis=1)
metrics["Best Model"] = metrics.idxmax(axis=1)
metrics

Unnamed: 0,XGB,Best Value,Best Model
Accuracy,0.862478,0.862478,XGB
AUC,0.811466,0.811466,XGB
Precision,0.706502,0.706502,XGB
Recall,0.714769,0.714769,XGB
R2,0.23778,0.23778,XGB
Training score,0.921979,0.921979,XGB


**With PCA**

In [57]:
from sklearn.decomposition import PCA

In [74]:
pca = PCA(n_components=7)
embeddings_3d = pca.fit_transform(embeddings)

In [95]:
embeddings_df = pd.DataFrame(embeddings_3d, columns=['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7'])
full_df_pca = pd.concat([embeddings_df, dataset], axis=1)

In [96]:
from sklearn.preprocessing import StandardScaler

In [97]:
X_train, Y_train = full_df_pca.iloc[:32561].drop('income', axis=1), full_df_pca.iloc[:32561]['income']
X_test, Y_test = full_df_pca.iloc[32561:].drop('income', axis=1), full_df_pca.iloc[32561:]['income']

numeric_features = ['age','education.num','capital.gain','capital.loss','hours.per.week', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7']
scaler = StandardScaler()
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])

models = []
models.append(('LR', LogisticRegression()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('RF', RandomForestClassifier(n_estimators=100, max_features=3)))
models.append(('XGB', XGBClassifier()))

names = []
accuracies = []
aucs = []
precisions = []
recalls = []

for name, model in models:

    model.fit(X_train, Y_train)
    
    y_pred = model.predict(X_test)
    predictions = [round(value) for value in y_pred]

    accuracy = accuracy_score(Y_test, predictions)
    auc = roc_auc_score(Y_test, predictions)
    precision = precision_score(Y_test, predictions)
    recall = recall_score(Y_test, predictions)

    names.append(name)
    accuracies.append(accuracy)
    aucs.append(auc)
    precisions.append(precision_score(Y_test, predictions))
    recalls.append(recall_score(Y_test, predictions))

    msg = "%s: accuracy %f - AUC %f" % (name, accuracy, auc)
    print(msg)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LR: accuracy 0.854186 - AUC 0.767156
KNN: accuracy 0.832873 - AUC 0.749791
CART: accuracy 0.815798 - AUC 0.739421
NB: accuracy 0.586450 - AUC 0.706733
RF: accuracy 0.848044 - AUC 0.763404
XGB: accuracy 0.869664 - AUC 0.797133


**With CV**

In [98]:
models = [
    #("LR", LogisticRegression()),
    #("KNN", KNeighborsClassifier()),
    #("CART", DecisionTreeClassifier()),
    #("NB", GaussianNB()),
    #("RF", RandomForestClassifier()),
    ("XGB", XGBClassifier()),
]

param_grids = {

    "LogisticRegression": {
        "C": [0.01, 0.1, 1, 10, 100],
        "penalty": ["l1", "l2"],
        "solver": ["liblinear", "saga"],
    },

    "KNeighborsClassifier": {
        "n_neighbors": [3, 5, 7, 9],
        "weights": ["uniform", "distance"],
        "metric": ["euclidean", "manhattan"],
    },

    "DecisionTreeClassifier": {
        "max_depth": [3, 5, 10, None],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
    },

    "GaussianNB": {"var_smoothing": [1e-9, 1e-8, 1e-7]},

    "RandomForestClassifier": {
        "n_estimators": [100, 200, 300],
        "max_features": [2, 3, 4],
        "max_depth": [None, 3, 5, 10],
    },

    "XGBClassifier": {
        "n_estimators": [100, 200, 300],
        "learning_rate": [0.01, 0.1, 0.2],
        "max_depth": [3, 5, 7],
    },

}

In [99]:
#cell runs in 2min15
skf = StratifiedKFold(n_splits=5)
searches = {}
SCORING_METRIC = "roc_auc"
for name, model in models:
    print(f"Running RandomizedSearchCV for {model.__class__.__name__}")
    search = RandomizedSearchCV(
        model,
        param_grids[model.__class__.__name__],
        cv=skf.split(X_train, Y_train),
        scoring=SCORING_METRIC,
        n_jobs=-1,
        )
    search.fit(X_train, Y_train)
    searches[name] = search

Running RandomizedSearchCV for XGBClassifier


In [100]:
def find_best_threshold(y_true, y_prob):
    best_threshold = 0.5
    best_score = 0
    for threshold in np.arange(0.1, 0.9, 0.001):
        score = f1_score(y_true, y_prob >= threshold)
        if score > best_score:
            best_score = score
            best_threshold = threshold
    return best_threshold

In [101]:
# cell runs in 1min50
names = []
accuracies = []
aucs = []
precisions = []
recalls = []
best_thresholds = []
scores =[]
r2s = []
for name, search in searches.items():
    model = search.best_estimator_
    probas = model.predict_proba(X_train.values)[:, 1]
    best_threshold = find_best_threshold(Y_train, probas) # for f1

    # metrics on test set
    preds = (model.predict_proba(X_test.values)[:, 1] >= best_threshold).astype(int)

    auc = roc_auc_score(Y_test, preds)
    accuracy = accuracy_score(Y_test, preds)
    precision = precision_score(Y_test, preds)
    recall = recall_score(Y_test, preds)
    r2 = r2_score(Y_test, preds)

    names.append(name)
    aucs.append(auc)
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    r2s.append(r2)
    scores.append(search.best_score_)
    best_thresholds.append(best_threshold)

    msg = f"{name}: Best Threshold {best_threshold:.2f} - Accuracy {accuracy:.4f} - AUC {auc:.4f}"
    print(msg)

XGB: Best Threshold 0.38 - Accuracy 0.8626 - AUC 0.8241


In [102]:
metrics = pd.DataFrame(
    {
        #"Best threshold": best_thresholds,
        "Accuracy": accuracies,
        "AUC": aucs,
        "Precision": precisions,
        "Recall": recalls,
        "R2": r2s,
        "Training score": scores,
    },
    index=names,
).transpose()
metrics["Best Value"] = metrics.max(axis=1)
metrics["Best Model"] = metrics.idxmax(axis=1)
metrics

Unnamed: 0,XGB,Best Value,Best Model
Accuracy,0.862601,0.862601,XGB
AUC,0.824117,0.824117,XGB
Precision,0.692972,0.692972,XGB
Recall,0.75117,0.75117,XGB
R2,0.238461,0.238461,XGB
Training score,0.926134,0.926134,XGB


**Predicting only with embeddings**

In [103]:
full_df_3 = pd.concat([embeddings_df, dataset['income']], axis=1)

**With PCA**

In [72]:
X_train, Y_train = full_df_3.iloc[:32561].drop('income', axis=1), full_df_3.iloc[:32561]['income']
X_test, Y_test = full_df_3.iloc[32561:].drop('income', axis=1), full_df_3.iloc[32561:]['income']

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

models = []
models.append(('LR', LogisticRegression()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('RF', RandomForestClassifier(n_estimators=100, max_features=3)))
models.append(('XGB', XGBClassifier()))

names = []
accuracies = []
aucs = []
precisions = []
recalls = []

for name, model in models:

    model.fit(X_train, Y_train)
    
    y_pred = model.predict(X_test)
    predictions = [round(value) for value in y_pred]

    accuracy = accuracy_score(Y_test, predictions)
    auc = roc_auc_score(Y_test, predictions)
    precision = precision_score(Y_test, predictions)
    recall = recall_score(Y_test, predictions)

    names.append(name)
    accuracies.append(accuracy)
    aucs.append(auc)
    precisions.append(precision_score(Y_test, predictions))
    recalls.append(recall_score(Y_test, predictions))

    msg = "%s: accuracy %f - AUC %f" % (name, accuracy, auc)
    print(msg)

LR: accuracy 0.815920 - AUC 0.698285
KNN: accuracy 0.790308 - AUC 0.673077
CART: accuracy 0.690314 - AUC 0.613363
NB: accuracy 0.817272 - AUC 0.706353
RF: accuracy 0.767766 - AUC 0.677716
XGB: accuracy 0.798170 - AUC 0.690885


**With CV**

In [None]:
models = [
    #("LR", LogisticRegression()),
    #("KNN", KNeighborsClassifier()),
    #("CART", DecisionTreeClassifier()),
    #("NB", GaussianNB()),
    #("RF", RandomForestClassifier()),
    ("XGB", XGBClassifier()),
]

param_grids = {

    "LogisticRegression": {
        "C": [0.01, 0.1, 1, 10, 100],
        "penalty": ["l1", "l2"],
        "solver": ["liblinear", "saga"],
    },

    "KNeighborsClassifier": {
        "n_neighbors": [3, 5, 7, 9],
        "weights": ["uniform", "distance"],
        "metric": ["euclidean", "manhattan"],
    },

    "DecisionTreeClassifier": {
        "max_depth": [3, 5, 10, None],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
    },

    "GaussianNB": {"var_smoothing": [1e-9, 1e-8, 1e-7]},

    "RandomForestClassifier": {
        "n_estimators": [100, 200, 300],
        "max_features": [2, 3, 4],
        "max_depth": [None, 3, 5, 10],
    },

    "XGBClassifier": {
        "n_estimators": [100, 200, 300],
        "learning_rate": [0.01, 0.1, 0.2],
        "max_depth": [3, 5, 7],
    },

}

In [None]:
#cell runs in 2min15
skf = StratifiedKFold(n_splits=5)
searches = {}
SCORING_METRIC = "roc_auc"
for name, model in models:
    print(f"Running RandomizedSearchCV for {model.__class__.__name__}")
    search = RandomizedSearchCV(
        model,
        param_grids[model.__class__.__name__],
        cv=skf.split(X_train, Y_train),
        scoring=SCORING_METRIC,
        n_jobs=-1,
        )
    search.fit(X_train, Y_train)
    searches[name] = search

Running RandomizedSearchCV for XGBClassifier


In [None]:
def find_best_threshold(y_true, y_prob):
    best_threshold = 0.5
    best_score = 0
    for threshold in np.arange(0.1, 0.9, 0.001):
        score = f1_score(y_true, y_prob >= threshold)
        if score > best_score:
            best_score = score
            best_threshold = threshold
    return best_threshold

In [None]:
# cell runs in 1min50
names = []
accuracies = []
aucs = []
precisions = []
recalls = []
best_thresholds = []
scores =[]
r2s = []
for name, search in searches.items():
    model = search.best_estimator_
    probas = model.predict_proba(X_train.values)[:, 1]
    best_threshold = find_best_threshold(Y_train, probas) # for f1

    # metrics on test set
    preds = (model.predict_proba(X_test.values)[:, 1] >= best_threshold).astype(int)

    auc = roc_auc_score(Y_test, preds)
    accuracy = accuracy_score(Y_test, preds)
    precision = precision_score(Y_test, preds)
    recall = recall_score(Y_test, preds)
    r2 = r2_score(Y_test, preds)

    names.append(name)
    aucs.append(auc)
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    r2s.append(r2)
    scores.append(search.best_score_)
    best_thresholds.append(best_threshold)

    msg = f"{name}: Best Threshold {best_threshold:.2f} - Accuracy {accuracy:.4f} - AUC {auc:.4f}"
    print(msg)

XGB: Best Threshold 0.37 - Accuracy 0.8640 - AUC 0.8219


In [None]:
metrics = pd.DataFrame(
    {
        #"Best threshold": best_thresholds,
        "Accuracy": accuracies,
        "AUC": aucs,
        "Precision": precisions,
        "Recall": recalls,
        "R2": r2s,
        "Training score": scores,
    },
    index=names,
).transpose()
metrics["Best Value"] = metrics.max(axis=1)
metrics["Best Model"] = metrics.idxmax(axis=1)
metrics

Unnamed: 0,XGB,Best Value,Best Model
Accuracy,0.864013,0.864013,XGB
AUC,0.821899,0.821899,XGB
Precision,0.700196,0.700196,XGB
Recall,0.74207,0.74207,XGB
R2,0.246291,0.246291,XGB
Training score,0.925893,0.925893,XGB


**Without PCA**

In [104]:
full_df_4 = pd.concat([embeddings, dataset[['income']]], axis=1)

In [105]:
X_train, Y_train = full_df_4.iloc[:32561].drop('income', axis=1), full_df_4.iloc[:32561]['income']
X_test, Y_test = full_df_4.iloc[32561:].drop('income', axis=1), full_df_4.iloc[32561:]['income']

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

models = []
models.append(('LR', LogisticRegression()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('RF', RandomForestClassifier(n_estimators=100, max_features=3)))
models.append(('XGB', XGBClassifier()))

names = []
accuracies = []
aucs = []
precisions = []
recalls = []

for name, model in models:

    model.fit(X_train, Y_train)
    
    y_pred = model.predict(X_test)
    predictions = [round(value) for value in y_pred]

    accuracy = accuracy_score(Y_test, predictions)
    auc = roc_auc_score(Y_test, predictions)
    precision = precision_score(Y_test, predictions)
    recall = recall_score(Y_test, predictions)

    names.append(name)
    accuracies.append(accuracy)
    aucs.append(auc)
    precisions.append(precision_score(Y_test, predictions))
    recalls.append(recall_score(Y_test, predictions))

    msg = "%s: accuracy %f - AUC %f" % (name, accuracy, auc)
    print(msg)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LR: accuracy 0.861126 - AUC 0.782565
KNN: accuracy 0.823905 - AUC 0.726410
CART: accuracy 0.780542 - AUC 0.690480
NB: accuracy 0.705055 - AUC 0.732385
RF: accuracy 0.834654 - AUC 0.723479
XGB: accuracy 0.838216 - AUC 0.751044


**With CV**

In [107]:
models = [
    #("LR", LogisticRegression()),
    #("KNN", KNeighborsClassifier()),
    #("CART", DecisionTreeClassifier()),
    #("NB", GaussianNB()),
    #("RF", RandomForestClassifier()),
    ("XGB", XGBClassifier()),
]

param_grids = {

    "LogisticRegression": {
        "C": [0.01, 0.1, 1, 10, 100],
        "penalty": ["l1", "l2"],
        "solver": ["liblinear", "saga"],
    },

    "KNeighborsClassifier": {
        "n_neighbors": [3, 5, 7, 9],
        "weights": ["uniform", "distance"],
        "metric": ["euclidean", "manhattan"],
    },

    "DecisionTreeClassifier": {
        "max_depth": [3, 5, 10, None],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
    },

    "GaussianNB": {"var_smoothing": [1e-9, 1e-8, 1e-7]},

    "RandomForestClassifier": {
        "n_estimators": [100, 200, 300],
        "max_features": [2, 3, 4],
        "max_depth": [None, 3, 5, 10],
    },

    "XGBClassifier": {
        "n_estimators": [100, 200, 300],
        "learning_rate": [0.01, 0.1, 0.2],
        "max_depth": [3, 5, 7],
    },

}

In [108]:
#cell runs in 2min15
skf = StratifiedKFold(n_splits=5)
searches = {}
SCORING_METRIC = "roc_auc"
for name, model in models:
    print(f"Running RandomizedSearchCV for {model.__class__.__name__}")
    search = RandomizedSearchCV(
        model,
        param_grids[model.__class__.__name__],
        cv=skf.split(X_train, Y_train),
        scoring=SCORING_METRIC,
        n_jobs=-1,
        )
    search.fit(X_train, Y_train)
    searches[name] = search

Running RandomizedSearchCV for XGBClassifier


In [109]:
def find_best_threshold(y_true, y_prob):
    best_threshold = 0.5
    best_score = 0
    for threshold in np.arange(0.1, 0.9, 0.001):
        score = f1_score(y_true, y_prob >= threshold)
        if score > best_score:
            best_score = score
            best_threshold = threshold
    return best_threshold

In [110]:
# cell runs in 1min50
names = []
accuracies = []
aucs = []
precisions = []
recalls = []
best_thresholds = []
scores =[]
r2s = []
for name, search in searches.items():
    model = search.best_estimator_
    probas = model.predict_proba(X_train)[:, 1]
    best_threshold = find_best_threshold(Y_train, probas) # for f1

    # metrics on test set
    preds = (model.predict_proba(X_test)[:, 1] >= best_threshold).astype(int)

    auc = roc_auc_score(Y_test, preds)
    accuracy = accuracy_score(Y_test, preds)
    precision = precision_score(Y_test, preds)
    recall = recall_score(Y_test, preds)
    r2 = r2_score(Y_test, preds)

    names.append(name)
    aucs.append(auc)
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    r2s.append(r2)
    scores.append(search.best_score_)
    best_thresholds.append(best_threshold)

    msg = f"{name}: Best Threshold {best_threshold:.2f} - Accuracy {accuracy:.4f} - AUC {auc:.4f}"
    print(msg)

XGB: Best Threshold 0.42 - Accuracy 0.8409 - AUC 0.7798


In [111]:
metrics = pd.DataFrame(
    {
        #"Best threshold": best_thresholds,
        "Accuracy": accuracies,
        "AUC": aucs,
        "Precision": precisions,
        "Recall": recalls,
        "R2": r2s,
        "Training score": scores,
    },
    index=names,
).transpose()
metrics["Best Value"] = metrics.max(axis=1)
metrics["Best Model"] = metrics.idxmax(axis=1)
metrics

Unnamed: 0,XGB,Best Value,Best Model
Accuracy,0.840919,0.840919,XGB
AUC,0.779842,0.779842,XGB
Precision,0.663032,0.663032,XGB
Recall,0.664067,0.664067,XGB
R2,0.11829,0.11829,XGB
Training score,0.895625,0.895625,XGB
