In [1]:
import pandas as pd
import json
import sys
import os
import pandas as pd
import numpy as np

In [2]:
class Column(object):
    """
    Column module containing functionality to convert feature values into written text

    name: String corresponding to the name of the column in the data set.
    attribute: String corresponging to the text description of the column.
    col_type: The type of the column: binary, categorical or numerical.
    verb: The verb required to conjugate the attribute.
    encode_fn: The function used to encode categorical values of this column.

    """
    def __init__(self, name, attribute=None, col_type=None, verb=None, encode_fn=None):
        self.name = name
        self.attribute = attribute
        self.type = col_type
        self.verb = verb
        self.encode_fn = encode_fn

    def is_binary(self):
        return self.type == "binary"

    def is_categorical(self):
        return self.type == "categorical"

    def is_numerical(self):
        return self.type == "numerical"

    def create_sentence(self, value, imp_value, prefix, missing_word, replace_numbers, descriptive):
        """
        Parameters::
            value: The value of this column at a specific data point
            imp_value: The imputed value of this column at a specific data point.
            prefix: String containing the desired prefix to add at the beginning of the sentence ("", "the Patient", etc.)
            missing_word: String describing how to handle missing values (e.g. "", "is missing" "imp_replace")
            replace_numbers: Boolean indicating weather or not to replace numerical values with text (e.g. very low, high, normal)
            descriptive: Boolean indicating weather or not the sentence should be descriptive.

        Returns::
            String with a sentence describing the column and its value.
            In the case of missing values:
                1. If missing_word == "" the sentence is the empty string
                2. If missing_word == "imp_replace" the sentence is constructed using the imputed value
                3. For all other cases the sentence is constructed using the text in the string missing_word

        """
        if descriptive:
            return self.create_descriptive_sentence(value, imp_value, prefix, missing_word, replace_numbers)
        else:
            return self.create_basic_sentence(value, imp_value, prefix, missing_word, replace_numbers)

In [3]:
class Binary_Column(Column):
    """
    Binary Column submodule for columns with values in [1, 0, true, false, "1", "0", "true", "false"]

    verb: The positive from of the verb used to conjugate the attribute when value is 1, "1" or "True"
    neg_verb: Negative form of the verb used to conjugate the attribute when value is 0, "0" or "false"

    """
    def __init__(self, name, attribute, verb, neg_verb, encode_fn=None):
        self.neg_verb = neg_verb
        super().__init__(name, attribute, "binary", verb, encode_fn)


    def create_descriptive_sentence(self, value, imp_value, prefix, missing_word, replace_numbers):
        sentence = ""

        if str(value).lower()  in ["1", "0", "true", "false"]:

            if int(value) == 1:

                sentence = prefix + " " + self.verb + " " + self.attribute

            elif int(value) == 0:

                sentence = prefix + " " + self.neg_verb + " " + self.attribute

        return sentence


    def create_basic_sentence(self, value, imp_value, prefix, missing_word, replace_numbers):

        sentence = ""

        if str(value).lower()  in ["1", "0", "true", "false"]:

            if int(value) == 1:

                sentence = self.verb + " " + self.attribute + ": yes"

            elif int(value) == 0:

                sentence = self.neg_verb + " " + self.attribute +" : no"

        elif missing_word != "":
            sentence = self.verb + " " + self.attribute + ": " + missing_word
        return sentence

In [4]:
class Categorical_Column(Column):
    """
    Categorical Column submodule for columns with non-numerical values

    """
    def __init__(self, name, attribute, verb, encode_fn=None):
        super().__init__(name, attribute, "categorical", verb, encode_fn)

    def create_descriptive_sentence(self, value, imp_value, prefix, missing_word, replace_numbers):
        if len(prefix) != 0:
            prefix = prefix[:-1] + "'s "
        sentence = ""
        if str(value).lower() not in ["nan", "", "none", "missing"]:
            sentence = prefix + self.attribute + " " + self.verb + " " + str(value)
        elif missing_word not in ["", "imp_replace"]:
            sentence = prefix  + self.attribute + " " + self.verb + " " + missing_word
        elif missing_word == "imp_replace":
            sentence = prefix + self.attribute + " " + self.verb + " " + str(imp_value)
        return sentence


    def create_basic_sentence(self, value, imp_value, missing_word, replace_numbers):
        sentence = ""
        if  str(value).lower() not in ["nan", "", "none", "missing"]:
            sentence = self.attribute + ": " + str(value)
        elif missing_word not in ["", "imp_replace"]:
            sentence = self.attribute + ": " + missing_word
        elif missing_word == "imp_replace":
            sentence = self.attribute + ": " + str(imp_value)
        return sentence

In [5]:
class Numerical_Column(Column):
    """
    Numerical Column submodule for columns with numerical values

    avg: The average of the values observed for this column (to be computed usign Training set)
    sd: The standard deviation of the values observed for this column (to be computed usign Training set)

    """
    def __init__(self, name, attribute, verb, avg, sd, encode_fn = None):
        self.avg = avg
        self.sd = sd
        super().__init__(name, attribute, "numerical", verb, encode_fn)


    def create_descriptive_sentence(self, value, imp_value, prefix, missing_word, replace_numbers):
        if len(prefix) != 0:
            prefix = prefix[:-1] + "'s "
        sentence = ""
        if str(value).lower() not in ["nan", "", "none", "missing"]:
            value = float(value)
            col_value = self.encode_number(value, replace_numbers)
            sentence = prefix  + self.attribute + " " + self.verb + " " + str(col_value)
        elif  missing_word not in ["", "imp_replace"]:
            sentence = prefix  + self.attribute + " " + self.verb + " " + missing_word
        elif missing_word == "imp_replace":
            col_value = self.encode_number(imp_value, replace_numbers)
            sentence = prefix  + self.attribute + " " + self.verb + " " + str(col_value)
        return sentence


    def create_basic_sentence(self, value, imp_value, prefix, missing_word, replace_numbers):
        sentence = ""
        if  str(value).lower() not in ["nan", "", "none", "missing"]:
            value = float(value)
            col_value = self.encode_number(value, replace_numbers)
            sentence = self.attribute + ": " + str(col_value)
        elif missing_word not in ["", "imp_replace"]:
            sentence = self.attribute + ": " + missing_word
        elif missing_word == "imp_replace":
            col_value = self.encode_number(imp_value, replace_numbers)
            sentence = self.attribute + ": " + str(col_value)
        return sentence

    def encode_number(self, value, replace_numbers):
        new_value = value
        if replace_numbers:
            if self.avg - 2*self.sd > value:
                new_value = "very low"
            elif self.avg - 2*self.sd <= value < self.avg - self.sd:
                new_value = "low"
            elif self.avg + 2*self.sd >= value > self.avg + self.sd:
                new_value = "high"
            elif self.avg + 2*self.sd < value:
                new_value = "very high"
            else:
                new_value = "normal"
        return new_value

In [6]:
class Table(object):
    """
    Table module containing tabular information for a specific id.

    name: String corresponding to the name of the table.
    df: Dataframe containing the tabular data for a specific id.
    columns: List of Column objects corresponing to the columns in df.
    metadata: String containing metadata information about this table structure.
    time_col: Name of the column in df containing the timestamp for each observation.
    imputer: Function used to impute the missing values in df.

    """

    def __init__(self, name="", df=pd.DataFrame(), columns=None, metadata=None, imputer=None):

        self.name = name
        self.columns = columns
        self.metadata = metadata
        self.df = df
        self.is_empty = pd.isna(df).all().all()

    def create_encoded_imputed_vectors(self):
        """
        Creates encoded and imputed versions of the table contents.
        """
        encoded_df =  pd.DataFrame()

        for column in self.columns:
            col_values = self.df[column.name]
            col_encoder = column.encode_fn
            labels = col_encoder(col_values[col_values.notnull()])
            encoded_df[column.name] = col_values
            encoded_df[column.name] = pd.Series(labels, index=col_values[col_values.notnull()].index)

        self.encodings = encoded_df


    def create_text(self, prefix, missing_word, descriptive, meta, sep = "</s>"):
        """
        Creates a timestamped dataframe; each row contains a String (paragraph) with all the tabular information for the
        corresponding timestamp.

        Paramteres::
            prefix: String containing the desired prefix to add at the beginning of each sentence ("", "the Patient", etc.)
            missing_word: String describing how to handle missing values (e.g. "", "is missing" "imp_replace")
            replace_numbers: Boolean indicating weather or not to replace numerical values with text (e.g. very low, high, normal)
            descriptive: Boolean indicating weather or not each sentence should be descriptive.
            meta: Boolean indicating weather or not to include meta information in the paragraphs.
            sep: String indicating what symbol to use at the end of the paragraph as a separator between tables.
        """
        self.text = pd.DataFrame()
        text = []

        for t_i in range(self.df.shape[0]):

            text_i = ""

            if meta & (len(str(self.metadata)) >1):
                text_i = self.metadata

            for column in self.columns:

                value = self.df.iloc[t_i][column.name]

                imp_value = "Unkwown"
                col_text = column.create_sentence(value, imp_value, prefix, missing_word, descriptive=descriptive, replace_numbers=True)

                if len(col_text) >0:
                    col_text += ", "

                text_i += col_text

            text_i = text_i[:-2]+ ". " + sep
            text.append(text_i)

        self.text["text"] =  text

    def create_embeddings(self):
        """
        Creates a timestamped dataframe; each row contains NLP embeddings for the paragraph of the corresponding timestamp.
        """
        embeddings = []

        for i in range(self.text.shape[0]):

            text = self.text.iloc[i]["text"]

            full_embedding = get_biobert_embeddings(text)[0]

            embeddings.append(full_embedding.reshape(-1))

        emb_df =  pd.DataFrame(np.array(embeddings))
        emb_df = emb_df.set_index(self.text.index)

        merged_df = pd.concat([self.text, emb_df], axis=1)
        merged_df = merged_df.rename({i: self.name + "_" + str(i) for i in range(len(embeddings[0]))}, axis='columns')

        self.embeddings = merged_df.drop(["text"], axis = 1)

In [7]:
train_df = pd.read_csv("data/census_income/adult_data", sep=', ')
test_df = pd.read_csv("data/census_income/adult.test", sep=', ')

dataset = pd.concat([train_df, test_df])

#NaN are flagged as "?"
dataset['workclass'] = dataset['workclass'].replace('?', np.nan)
dataset['occupation'] = dataset['occupation'].replace('?', np.nan)
dataset['native.country'] = dataset['native.country'].replace('?', np.nan)

  train_df = pd.read_csv("data/census_income/adult_data", sep=', ')
  test_df = pd.read_csv("data/census_income/adult.test", sep=', ')


**Test: Categorical Column**

In [8]:
workclassColumn = Categorical_Column(name='workclass', attribute='Work Class', verb='is')

In [9]:
workclassColumn.create_descriptive_sentence(value=np.NaN, prefix="The", missing_word="imp_replace", imp_value="Unknown")

TypeError: Categorical_Column.create_descriptive_sentence() missing 1 required positional argument: 'replace_numbers'

In [None]:
workclassColumn.create_basic_sentence(value=np.NaN, missing_word="imp_replace", imp_value="Unknown")

'Work Class: Unknown'

**Test: Numerical Column**

In [10]:
ageColumn = Numerical_Column(name='age', attribute='Age', verb='is', avg=dataset['age'].mean(), sd=dataset['age'].std())

In [11]:
ageColumn.create_basic_sentence(value=dataset['age'].iloc[0], prefix="", missing_word="imp_replace", imp_value=dataset['age'].mean(), replace_numbers=False)

'Age: 39.0'

In [12]:
ageColumn.create_descriptive_sentence(value=dataset['age'].iloc[0], prefix="The", missing_word="imp_replace", imp_value=dataset['age'].mean(), replace_numbers=False)

"Th's Age is 39.0"

**Test: Binary Column**

In [13]:
dataset['sex_binary'] = np.where(dataset['sex']=='Male', 1, 0)

In [14]:
sexColumn = Binary_Column(name='sex_binary', attribute='Male', verb='is', neg_verb='is not')

In [15]:
sexColumn.create_basic_sentence(value=1, imp_value="", prefix="The gender", missing_word="imp_replace", replace_numbers=False)

'is Male: yes'

In [16]:
sexColumn.create_descriptive_sentence(value=1, imp_value="", prefix="The gender", missing_word="imp_replace", replace_numbers=False)

'The gender is Male'

**Test on whole table**

In [17]:
index = 0

row = dataset.iloc[index]

ageColumn = Numerical_Column(name='age', attribute='Age', verb='is', avg=dataset['age'].mean(), sd=dataset['age'].std())
workclassColumn = Categorical_Column(name='workclass', attribute='Workclass', verb='is')
educationColumn = Categorical_Column(name='education', attribute='Education', verb='is')
education_numColumn = Numerical_Column(name='education.num', attribute='Education Number', verb='is', avg=dataset['education.num'].mean(), sd=dataset['education.num'].std())
marital_statusColumn = Categorical_Column(name='marital.status', attribute='Marital Status', verb='is')
occupationColumn = Categorical_Column(name='occupation', attribute='Occupation', verb='is')
relationshipColumn = Categorical_Column(name='relationship', attribute='Relationship', verb='is')
raceColumn = Categorical_Column(name='race', attribute="Race", verb='is')
sexColumn = Categorical_Column(name='sex', attribute="Gender", verb='is')
capital_gainColumn = Numerical_Column(name='capital.gain', attribute="Capital Gain", verb='is', avg=dataset['capital.gain'].mean(), sd=dataset['capital.gain'].std())
capital_lossColumn = Numerical_Column(name='capital.loss', attribute="Capital Loss", verb='is', avg=dataset['capital.loss'].mean(), sd=dataset['capital.loss'].std())
hours_per_weekColumn = Numerical_Column(name='hours.per.week', attribute="Hours per Week", verb='is', avg=dataset['hours.per.week'].mean(), sd=dataset['hours.per.week'].std())
native_countryColumn = Categorical_Column(name='native.country', attribute="Native Country", verb='is')

censusTable = Table(name="census", df = dataset, columns=[ageColumn, workclassColumn, educationColumn, education_numColumn, marital_statusColumn, occupationColumn, relationshipColumn, raceColumn, sexColumn, capital_gainColumn, capital_lossColumn, hours_per_weekColumn, native_countryColumn], metadata="Census Income Dataset: ", imputer=None)

In [18]:
censusTable.create_text(prefix="The person ", missing_word="is missing", descriptive=True, meta=True, sep="</s>")
censusTable.text

Unnamed: 0,text
0,Census Income Dataset: The person's Age is nor...
1,Census Income Dataset: The person's Age is nor...
2,Census Income Dataset: The person's Age is nor...
3,Census Income Dataset: The person's Age is hig...
4,Census Income Dataset: The person's Age is nor...
...,...
48837,Census Income Dataset: The person's Age is nor...
48838,Census Income Dataset: The person's Age is hig...
48839,Census Income Dataset: The person's Age is nor...
48840,Census Income Dataset: The person's Age is nor...


In [207]:
first_string = censusTable.text.iloc[0]["text"]
first_string

"Census Income Dataset: The person's Age is normal, The person's Workclass is State-gov, The person's Education is Bachelors, The person's Education Number is high, The person's Marital Status is Never-married, The person's Occupation is Adm-clerical, The person's Relationship is Not-in-family, The person's Race is White, The person's Gender is Male, The person's Capital Gain is normal, The person's Capital Loss is normal, The person's Hours per Week is normal, The person's Native Country is United-States. </s>"