In [1]:
#import all required modules
from LiarLiarPreProcessor import LiarLiarPreProcessor
import numpy as np

# LiarLiarPreProcessor Demo Script
The following script demos the capabilities of the LiarLiar pre-processor that we used to encode the data. It is a wrapper around the PreProcessor class designed specifically for the LiarLiar dataset.

## Initializing the LiarLiarPreProcessor

### Create an instance of the LiarLiarPreProcessor class and import the dataset from a file

The preprocessor imports files based on the file name. The file must be located in the "datasets" folder. We automatically replace Null and NaN dataset entries with "N/A".

When loading a dataset, the preprocessor automatically looks for a .tsv file named "train.tsv" in the "datasets" folder (the file must be in this folder) with the following column headers:
1. 'id': the ID of the statement ([ID].json).
2. 'label': the label.
3. 'statement': the statement.
4. 'subjects': the subject(s).
5. 'speaker': the speaker.
6. 'speaker_job_title': the speaker's job title.
7. 'state_info': the state info.
8. 'party_affiliation': the party affiliation.

Column 9-13: the total credit history count, including the current statement.

9. 'count_1', pants on fire counts.
10. 'count_2', false counts.
11. 'count_3', barely true counts.
12. 'count_4', half true counts.
13. 'count_5', mostly true counts.

14. 'context': the context (venue / location of the speech or statement).

Aside from this, there are the following options:
* replace_Null_NaN: This option automatically replaces any Null or NaN values in the dataset with 'N/A'

In [2]:
# Instantiate the pre-processor with its verbose flag switched off.
liar_liar_pre_processor = LiarLiarPreProcessor(verbose=False)

# Options for reading the training file: a tab-delimited "train.tsv", no
# custom column headers, and Null/NaN replacement turned on.
training_import_options = dict(
    file_name="train.tsv",
    deliminator='\t',
    custom_headers=None,
    replace_Null_NaN=True,
)

# Load the training data using the options collected above.
liar_liar_pre_processor.import_training_data(**training_import_options)

### Denoting which column in the dataset corresponds to the labels for each data sample

We provide a custom encoding (optional) so that each possible label can be encoded using a unique number. For additional flexibility, the labels (or any set of data for that matter) can be encoded based on the following options:
- Standard mapping: Labels are encoded either through the provided encoding_mapping or automatically using a unique integer for each label
- Normalized mapping: When it makes sense, labels can be normalized so as to range from 0 to 1
- Binarized mapping: Finally, labels can be binarized to be either 0 or 1. This generally only makes sense when there are only two labels or when the data is specifically constructed to be binarized (e.g. mostly true vs. mostly false)


By default, the labels for each data sample will come from the column titled: 'label' with the encoding as follows (although this can be changed):

{'pants-fire':0,
'false':1,
'barely-true':2,
'half-true':3,
'mostly-true':4,
'true':5}

In [3]:
# Label configuration: take the labels from the 'label' column, with the
# custom-encoding, normalize and binarize switches all left off.
label_options = {
    "label_header": 'label',
    "custom_label_encoding": False,
    "normalize": False,
    "binarize": False,
}

# Register the label column with the pre-processor.
liar_liar_pre_processor.set_label_header(**label_options)

## Configuring and Obtaining a Dataset

### Configuring a Dataset

In [8]:
# Every encoder configuration used in this demo, declared exactly once.
# Previously `encoder_parameters` was assigned twice in a row, so the first
# (three-encoder) list was dead code that was silently overwritten. Each entry
# is a plain dict; "encoder_name" is the key the selection below matches on.
available_encoders = [
    # Bag-of-words encoding of the 'statement' column, term filtering disabled.
    {"encoder_name": "statement",
        "encoder_type":"bag-of-words",
        "feature_name":"statement",
        "clean_strings":True,
        "remove_stop_words":True,
        "lematize":True,
        "filtering" : {
            "filtering_enabled":False,
            "filtered_terms": []
        }
    },
    # Categorical encoding of the 'party_affiliation' column with filtering on.
    # NOTE(review): whether "filtered_terms" lists the values to keep or the
    # values to drop is decided inside the pre-processor -- confirm.
    # NOTE(review): "Binarize" is capitalised here while set_label_header uses
    # a lowercase `binarize` -- confirm which spelling the encoder expects.
    {"encoder_name": "party affiliation",
        "encoder_type":"encode",
        "feature_name":"party_affiliation",
        "encoding_mapping":None,
        "normalize":False,
        "Binarize":False,
        "filtering" : {
            "filtering_enabled":True,
            "filtered_terms": ['republican', 'democrat', 'none']
        }
    },
    # Credit-history encoding built from the five count_* columns.
    {"encoder_name": "credit score",
        "encoder_type":"credit history",
        "feature_names":['count_1','count_2','count_3','count_4','count_5'],
        "compute_credit_history":True
    }
]

# Names of the encoders that are actually applied. Only the statement encoder
# is active (this matches the list that was effectively in use before); add
# "party affiliation" and/or "credit score" to enable the other encoders.
active_encoder_names = ["statement"]

# Keep the declaration order of available_encoders for the selected entries.
encoder_parameters = [
    encoder for encoder in available_encoders
    if encoder["encoder_name"] in active_encoder_names
]

#load the encoding configurations for the desired dataset
liar_liar_pre_processor.configure_encodings(encoder_parameters=encoder_parameters)

## Generate a dataset



In [9]:
# Build the dataset from the imported training data and configured encodings.
# get_dataset() returns three values, unpacked here as y, X and X_headers.
# presumably y = encoded labels, X = encoded feature matrix and
# X_headers = the column names of X -- TODO confirm against the class.
y,X,X_headers = liar_liar_pre_processor.get_dataset()

In [10]:
# Apply the configured encodings to a file via apply_encodings_to_new_data().
# NOTE: the file passed here is the training file ('train.tsv') again, not a
# separate test file, so X_test can be compared against X from get_dataset().
y_test,X_test = liar_liar_pre_processor.apply_encodings_to_new_data('train.tsv')

In [11]:
# Consistency check: list the index positions where X and X_test differ.
# Empty index arrays mean the two encodings of the same file match
# element-wise. Only the features are compared here, not y vs. y_test.
# assumes X and X_test are dense arrays of identical shape -- TODO confirm
np.where(X != X_test)

(array([], dtype=int64), array([], dtype=int64))