In [1]:
import pandas as pd

In [83]:
# Clean affiliation column, utilizes one-hot encoding
def one_hot_affiliation(df):
    """Replace the 'affiliation' column with one-hot indicator columns.

    Only 'republican' and 'democrat' are kept as categories. Every other
    value (including missing ones) is treated as missing and is counted in
    the extra NaN indicator column produced by ``dummy_na=True``.

    Args:
        df: DataFrame with an 'affiliation' column. It is not modified.

    Returns:
        A new DataFrame without 'affiliation', with one integer 0/1 column
        appended per kept category present in the data, plus the NaN column.
        The row count and index always match ``df``.

    Raises:
        ValueError: if an indicator column name already exists in ``df``.
    """
    # Mask on a derived Series so the caller's frame is left untouched.
    affiliation = df['affiliation'].where(
        df['affiliation'].isin(['republican', 'democrat'])
    )

    # one hot encoding for CNN input
    affiliation_one_hot = pd.get_dummies(affiliation, dummy_na=True, dtype=int)

    remaining = df.drop(['affiliation'], axis=1)

    # Keep failing loudly on a name clash instead of emitting duplicate columns.
    clashes = [col for col in affiliation_one_hot.columns if col in remaining.columns]
    if clashes:
        raise ValueError(f"columns overlap with one-hot affiliation columns: {clashes}")

    # Both frames share the exact same index, so a column-wise concat lines the
    # rows up one-to-one; an index join would multiply rows on duplicate labels.
    return pd.concat([remaining, affiliation_one_hot], axis=1)

def truth_score(df):
    """Replace the categorical 'label' column with a numeric 'truth_score'.

    Labels are mapped onto evenly spaced scores from 0.0 ('pants-fire') to
    1.0 ('true'). A label that is not a key of the mapping (including a
    missing value) yields NaN.

    Args:
        df: DataFrame with a 'label' column. It is not modified.

    Returns:
        A new DataFrame without 'label' and with 'truth_score' appended as
        the last column. The row count and index always match ``df``.

    Raises:
        ValueError: if ``df`` already has a 'truth_score' column.
    """
    # TODO: tune these as necessary
    label_map = {
        "pants-fire": 0.0,
        "false": 0.2,
        "barely-true": 0.4,
        "half-true": 0.6,
        "mostly-true": 0.8,
        "true": 1.0,
    }

    # map labels to scores (local name chosen so it does not shadow this function)
    scores = df['label'].map(label_map)

    remaining = df.drop(['label'], axis=1)

    # Keep failing loudly on a name clash instead of silently overwriting.
    if 'truth_score' in remaining.columns:
        raise ValueError("columns overlap: 'truth_score' already exists")

    # `scores` carries the same index as `df`, so assignment is row-for-row;
    # an index join would multiply rows whenever index labels repeat.
    return remaining.assign(truth_score=scores)

def one_hot_topic(df, min_count, verbose=False):
    """Replace the comma-separated 'topic' column with one-hot topic columns.

    Each topic string is split on ',', and every item is stripped and
    lower-cased; empty items and missing values contribute no topic. Only
    topics that occur in strictly more than ``min_count`` rows get a column.

    Args:
        df: DataFrame with a 'topic' column. It is not modified.
        min_count: occurrence threshold; topics with ``min_count`` or fewer
            occurrences are dropped.
        verbose: when True, print the kept and removed topics with their
            counts.

    Returns:
        A new DataFrame without 'topic', with one 0/1 column appended per
        kept topic, ordered by descending occurrence count. The row count
        and index always match ``df``.

    Raises:
        ValueError: if a kept topic name already exists as a column of ``df``.
    """
    # normalize topics: fill NaN, strip whitespace, lowercase, remove empty items
    # (done on a derived Series so the caller's frame is left untouched)
    normalized = (
        df['topic']
        .fillna('')
        .astype(str)
        .apply(lambda s: ','.join(t.strip().lower() for t in s.split(',') if t.strip()))
    )

    # create column for every distinct topic
    topics = normalized.str.get_dummies(sep=',')

    # compute counts per topic (how many rows include each topic)
    topic_counts = topics.sum(axis=0).sort_values(ascending=False)

    # keep topics that appear in more than min_count rows
    kept_topics = topic_counts[topic_counts > min_count].index.tolist()

    if verbose:
        removed_topics = topic_counts[topic_counts <= min_count].index.tolist()

        print(f"Keeping {len(kept_topics)} topics with more than {min_count} occurrences.")
        if kept_topics:
            print('Kept topics and counts:')
            print(topic_counts.loc[kept_topics].sort_values(ascending=False).to_string())
        else:
            print('No topics kept.')

        print()
        print(f"Removing {len(removed_topics)} topics with {min_count} or fewer occurrences.")
        if removed_topics:
            print('Removed topics and counts:')
            print(topic_counts.loc[removed_topics].sort_values(ascending=False).to_string())
        else:
            print('No topics removed.')

    # drop original topic column
    remaining = df.drop(['topic'], axis=1)
    if not kept_topics:
        return remaining

    # Keep failing loudly on a name clash instead of emitting duplicate columns.
    clashes = [topic for topic in kept_topics if topic in remaining.columns]
    if clashes:
        raise ValueError(f"columns overlap with one-hot topic columns: {clashes}")

    # Both frames share the exact same index, so a column-wise concat lines the
    # rows up one-to-one; an index join would multiply rows on duplicate labels.
    return pd.concat([remaining, topics.loc[:, kept_topics]], axis=1)




In [None]:
# Build the cleaned training DataFrame from the raw TSV file.

# occurrence threshold for topics: a topic gets its own column only if it
# appears in more than this many rows
min_count = 25

# the file has no header row, so columns are addressed by position
train_df = pd.read_csv(r'liar_dataset/train.tsv', sep='\t', header=None)

# keep only the label (true, half-true, etc.), the statement, the topic
# (energy, history, etc.), and the political affiliation
train_df = train_df[[1, 2, 3, 7]].rename(
    columns={1: 'label', 2: 'statement', 3: 'topic', 7: 'affiliation'}
)

train_df = (
    train_df
    # normalize labels to a 0-1 score (pants-fire -> true)
    .pipe(truth_score)
    # one-hot encode affiliation
    .pipe(one_hot_affiliation)
    # one-hot encode topic, dropping topics with min_count or fewer occurrences
    .pipe(one_hot_topic, min_count)
)



In [85]:
# preview the first 15 rows of the cleaned training frame
train_df.head(15)


Unnamed: 0,statement,truth_score,democrat,republican,NaN,economy,health-care,taxes,federal-budget,education,...,islam,gambling,bush-administration,consumer-safety,redistricting,bankruptcy,public-service,tourism,food-safety,patriotism
0,Says the Annies List political group supports ...,0.2,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,When did the decline of coal start? It started...,0.6,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Hillary Clinton agrees with John McCain ""by vo...",0.8,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Health care reform legislation is likely to ma...,0.2,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,The economic turnaround started at the end of ...,0.6,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,The Chicago Bears have had more starting quart...,1.0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
6,Jim Dunnam has not lived in the district he re...,0.4,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,I'm the only person on this stage who has work...,0.6,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,"However, it took $19.5 million in Oregon Lotte...",0.6,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,Says GOP primary opponents Glenn Grothman and ...,0.8,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
