In [1]:
#import all required modules
from PreProcessor import PreProcessor
from Encoder import Encoder

# PreProcessor Demo Script
The following script demonstrates the capabilities of the PreProcessor and presents some preliminary analysis that we performed on the dataset itself.

## Initializing the PreProcessor

### Create an instance of the PreProcessor class and import the dataset from a file

The PreProcessor imports a dataset by file name; the file must be located in the "datasets" folder. With `replace_Null_NaN=True`, Null and NaN dataset entries are automatically replaced with "N/A".

In [2]:
train_file_name = "train.tsv"

# Column headers for the dataset, listed in file order (columns 1-14).
labels = [
    "id",                 # 1: ID of the statement ([ID].json)
    "label",              # 2: the label
    "statement",          # 3: the statement
    "subjects",           # 4: the subject(s)
    "speaker",            # 5: the speaker
    "speaker_job_title",  # 6: the speaker's job title
    "state_info",         # 7: the state info
    "party_affiliation",  # 8: the party affiliation
    # 9-13: total credit history counts, including the current statement:
    #   count_1 = barely true, count_2 = false, count_3 = half true,
    #   count_4 = mostly true, count_5 = pants on fire
    *(f"count_{k}" for k in range(1, 6)),
    "context",            # 14: venue / location of the speech or statement
]

# Initialize the PreProcessor and import the training set.
# The file name is taken from `train_file_name` (assigned at the top of this
# cell) so that it is specified in exactly one place.
pre_processor = PreProcessor(verbose=True)
pre_processor.import_data_from_file(
    file_name=train_file_name,
    deliminator='\t',          # columns are tab-separated (.tsv)
    headers=labels,            # column names from the `labels` list
    replace_Null_NaN=True)     # Null / NaN entries become "N/A"


PreProcessor.__init()__: Data Imported


### Denoting which column in the dataset corresponds to the labels for each data sample

We provide a custom encoding (optional) so that each possible label can be encoded using a unique number. For additional flexibility, the labels (or any set of data for that matter) can be encoded based on the following options:
- Standard mapping: Labels are encoded either through the provided encoding_mapping or automatically, using a unique integer for each label.
- Normalized mapping: When it makes sense, labels can be normalized to range from 0 to 1.
- Binarized mapping: Finally, labels can be binarized to be either 0 or 1. This generally makes sense only when there are exactly two labels or when the data is specifically constructed to be binarized (e.g., mostly true vs. mostly false).

In [5]:

#set the label column
label_mapping = {'pants-fire':0,
             'false':1,
             'barely-true':2,
             'half-true':3,
             'mostly-true':4,
             'true':5}
# Mark the 'label' column as the target and encode it with the custom
# mapping; normalization and binarization are both switched off.
pre_processor.set_label_header(
    label_header='label', encoding_mapping=label_mapping,
    normalize=False, binarize=False,
)

Encoder.encode: encoding_mappings: {'pants-fire': 0, 'false': 1, 'barely-true': 2, 'half-true': 3, 'mostly-true': 4, 'true': 5}



In [8]:
# Build a bag-of-words encoder over the "context" column, with string
# cleaning, stop-word removal and lemmatization switched on.
# NOTE(review): the variable is called `subjects_encoder` although it encodes
# the "context" feature; the name is kept in case other cells reference it.
desired_feature = "context"
subjects_encoder = pre_processor.get_bag_of_words_encoder_for_feature(
    feature_name=desired_feature, clean_strings=True,
    remove_stop_words=True, lematize=True,
)

# Keys of the encoder's mapping (presumably one per distinct word - confirm).
unique_words = subjects_encoder.encoding_mappings.keys()

# Show the 20 most common words, then their counts, each on its own line.
features, counts = subjects_encoder.get_most_common_words(20)
print(features, counts, sep="\n")

['interview', 'speech', 'ad', 'news', 'campaign', 'debate', 'press', 'release', 'state', 'tv', 'post', 'comment', 'fox', 'radio', 'conference', 'television', 'presidential', 'republican', 'statement', 'cnn']
[1754 1062  877  868  809  768  687  671  498  495  381  344  339  336
  300  282  252  247  245  237]


