In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [None]:
# column info
# data from https://www.kaggle.com/datasets/doanquanvietnamca/liar-dataset?resource=download
'''
Column 1: the ID of the statement ([ID].json).
Column 2: the label.
Column 3: the statement.
Column 4: the subject(s).
Column 5: the speaker.
Column 6: the speaker's job title.
Column 7: the state info.
Column 8: the party affiliation.
Column 9-13: the total credit history count, including the current statement.
9: barely true counts.
10: false counts.
11: half true counts.
12: mostly true counts.
13: pants on fire counts.
Column 14: the context (venue / location of the speech or statement).
'''

In [11]:
# load data
# Column names shared by every LIAR split file. The reads below pass header=None,
# i.e. the TSVs are treated as having no header row, so the names are supplied here.
# Keeping one list guarantees the three splits cannot drift apart.
LIAR_COLUMNS = ["id", "label", "statement",
        "subject", "speaker", "speaker_job", "state", "party", "barely_true_counts", "false_counts",
        "half_true_counts", "mostly_true_counts", "pants_on_fire_counts", "context"]


def load_liar_split(path, columns=LIAR_COLUMNS):
    """Read one tab-separated split file into a DataFrame with named columns.

    Parameters
    ----------
    path : str
        Location of the .tsv file.
    columns : list of str
        Column names to assign, in file order.

    Returns
    -------
    pandas.DataFrame
    """
    # NOTE(review): no `quoting` argument is passed, so pandas applies its default
    # double-quote handling. If statements can contain unbalanced quotes this may
    # merge rows -- confirm the loaded row counts against the dataset description.
    return pd.read_csv(path, sep='\t', header=None, names=list(columns))


train = load_liar_split('train.tsv')
test = load_liar_split('test.tsv')
val = load_liar_split('valid.tsv')

# Combined frame (train, then test, then val) with a fresh 0..n-1 index.
all_data = pd.concat([train, test, val], ignore_index=True)

In [12]:
# remove rows with na in most columns
# A missing barely_true_counts value is the marker used to discard a row;
# the surviving rows are renumbered from 0 and the old index is thrown away.
train, test, val, all_data = (
    frame.dropna(subset=["barely_true_counts"]).reset_index(drop=True)
    for frame in (train, test, val, all_data)
)

In [13]:
# replace na in speaker_job, state, context with "unknown"
# fillna is applied to the whole frame, so a NaN left in any column
# (not only speaker_job / state / context) becomes the string "unknown".
train, test, val, all_data = (
    frame.fillna("unknown") for frame in (train, test, val, all_data)
)

In [None]:
# subject
# keep subjects whose training-set count is >= 50; group everything else into an "other" category
# when a statement lists several subjects, keep only the one with the highest training-set count
# subject counts are computed from the training set only and then applied to the test and val sets
#     (in practice, only the training set would be available)
# for the val and test sets:
#     if none of a sample's subjects appear in the training set, label it "other"
# result: 97 subjects

In [14]:
# get counts for each subject only from train set

def get_subject_counts(df):
    """Count how often each subject occurs in ``df["subject"]``.

    Every value of the ``subject`` column is lower-cased and split on ",";
    each resulting entry adds one to that subject's tally, so a row listing
    several subjects contributes to each of them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string-valued ``subject`` column.

    Returns
    -------
    dict
        Maps lower-cased subject -> number of occurrences, in order of first
        appearance.
    """
    subjects_counts = {}  # key: subject, value: counts

    # Walk only the column that is needed; df.iterrows() would build a full
    # Series for every row just to read one field.
    for subject_field in df["subject"]:
        for subject in subject_field.lower().split(","):
            subjects_counts[subject] = subjects_counts.get(subject, 0) + 1

    return subjects_counts

# Subject frequencies are taken from the training split only; this one table is
# later passed to add_single_subject for every split.
subjects_counts = get_subject_counts(train)

In [15]:
# adds column to df with subject with highest count
def add_single_subject(df, subject_counts, min_count=50):
    """Collapse each row's comma-separated subject list to a single subject.

    For every row the ``subject`` value is lower-cased and split on ",". Each
    entry is looked up in ``subject_counts``; entries missing from the table
    are treated as ``("other", 0)``. The entry with the highest count is kept
    (on a tie, the one listed last), and it is replaced by ``"other"`` when
    its count is below ``min_count``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string-valued ``subject`` column.
    subject_counts : dict
        Maps lower-cased subject -> count (e.g. from ``get_subject_counts``).
    min_count : int, default 50
        Subjects whose count is below this threshold are labelled "other".

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object; a ``single_subject`` column is added in place.
    """
    single_subject = []

    for subject_field in df["subject"]:
        # (subject, count) pairs; unseen subjects fall into the "other" bucket
        candidates = [
            (subject, subject_counts[subject]) if subject in subject_counts else ("other", 0)
            for subject in subject_field.lower().split(",")
        ]

        # max() returns the first maximal item it meets, so scanning the list in
        # reverse picks the last-listed subject among ties -- the same choice a
        # stable ascending sort followed by a reversal makes.
        top_subject, top_count = max(reversed(candidates), key=lambda pair: pair[1])

        single_subject.append(top_subject if top_count >= min_count else "other")

    # add column to df
    df["single_subject"] = single_subject

    return df

In [16]:
# Every split is mapped with the same training-derived subject table.
train, test, val, all_data = (
    add_single_subject(frame, subjects_counts)
    for frame in (train, test, val, all_data)
)

In [None]:
# context
# convert to lowercase, keep only alphanumeric characters, remove stop words, lemmatize words

In [8]:
import re
import nltk
from nltk.stem import WordNetLemmatizer

In [17]:
# convert to all lower case, get rid of punctuation and non-alphanumeric characters except apostrophe
def preprocess(col):
    """Lower-case each text and blank out disallowed characters.

    Any character other than an ASCII letter, a digit or an apostrophe is
    replaced by a single space (one space per character, so runs are not
    collapsed).

    Parameters
    ----------
    col : iterable of str

    Returns
    -------
    list of str
        One cleaned string per input, in the same order.
    """
    disallowed = re.compile(r"[^A-Za-z0-9']")
    return [disallowed.sub(" ", text.lower()) for text in col]

# tokenize context
def tokenize(col):
    """Split every string in ``col`` on whitespace.

    Parameters
    ----------
    col : iterable of str

    Returns
    -------
    list of list of str
        One token list per input string; empty or blank strings give ``[]``.
    """
    return [text.split() for text in col]

# remove stop words - useless/common/meaningless words
def remove_stopwords(col):
    """Drop English stop words from every token list.

    The stop-word list comes from ``nltk.corpus.stopwords.words('english')``;
    this assumes the NLTK stopwords corpus is already available locally.
    Matching is exact and case-sensitive, so tokens should be lower-cased first.

    Parameters
    ----------
    col : iterable of list of str
        Tokenized samples.

    Returns
    -------
    list of list of str
        The samples with stop words removed, token order preserved.
    """
    # A frozenset gives constant-time membership tests; the raw list would be
    # scanned once per token.
    stopwords = frozenset(nltk.corpus.stopwords.words('english'))

    return [[word for word in samp if word not in stopwords] for samp in col]

# lemmatization - changing format/tense of words to be the same (ex: sleeps --> sleep)
def lemmatize(col):
    """Lemmatize every token and discard uninformative results.

    Each token is passed through ``WordNetLemmatizer().lemmatize``. A lemma is
    kept only if it is longer than one character and is not purely numeric.

    Parameters
    ----------
    col : iterable of list of str
        Tokenized samples.

    Returns
    -------
    list of list of str
        Filtered lemmas per sample, in the original token order.
    """
    lemmatizer = WordNetLemmatizer()
    cleaned = []

    for tokens in col:
        lemmas = (lemmatizer.lemmatize(token) for token in tokens)
        cleaned.append(
            [lemma for lemma in lemmas if len(lemma) > 1 and not lemma.isnumeric()]
        )

    return cleaned

# join tokens together again
def untokenize(col):
    """Join each token list back into one space-separated string.

    Parameters
    ----------
    col : iterable of list of str

    Returns
    -------
    list of str
        One string per sample; an empty token list yields ``""``.
    """
    return [" ".join(tokens) for tokens in col]

def preprocess_df(df, col_name):
    """Run the text-cleaning pipeline on one column and store the result.

    The column is passed, in order, through ``preprocess``, ``tokenize``,
    ``remove_stopwords``, ``lemmatize`` and ``untokenize``. The output is
    written to a new column named ``"cleaned_" + col_name``.

    Parameters
    ----------
    df : pandas.DataFrame
    col_name : str
        Name of the text column to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the cleaned column added in place.
    """
    stages = (preprocess, tokenize, remove_stopwords, lemmatize, untokenize)

    values = df[col_name]
    for stage in stages:
        values = stage(values)

    # add cleaned column to df
    df["cleaned_" + col_name] = values

    return df

In [18]:
# Clean the "context" column of every split; each frame gains "cleaned_context".
train, test, val, all_data = (
    preprocess_df(frame, "context") for frame in (train, test, val, all_data)
)

In [20]:
# Write each cleaned split to its own CSV (all_data is not saved here).
for _split_frame, _csv_name in (
    (train, "train_cleaned.csv"),
    (test, "test_cleaned.csv"),
    (val, "val_cleaned.csv"),
):
    _split_frame.to_csv(_csv_name)

In [None]:
# speaker

In [255]:
# Statements per speaker across all splits, most frequent first.
# groupby().count() tallies non-null values per column; the "id" tally is kept.
speaker_counts = (
    all_data
    .groupby("speaker")
    .count()
    .sort_values("id", ascending=False)
    .reset_index()
    .loc[:, ["speaker", "id"]]
)

speaker_counts

Unnamed: 0,speaker,id
0,barack-obama,611
1,donald-trump,343
2,hillary-clinton,297
3,mitt-romney,212
4,john-mccain,189
...,...,...
3304,james-vincent,1
3305,jamie-oliver,1
3306,jamie-samons,1
3307,jamie-weinstein,1


In [191]:
# speaker job

In [247]:
# replace the "XX" placeholder in speaker_job with "unknown" (exact, case-sensitive match)
train, test, val, all_data = (
    frame.replace({'speaker_job': {"XX": "unknown"}})
    for frame in (train, test, val, all_data)
)

In [248]:
def preprocess2(col):
    """Lower-case each text and delete characters outside a small whitelist.

    Letters, digits, spaces, commas and hyphens are kept; every other
    character is removed (not replaced by a space).

    Parameters
    ----------
    col : iterable of str

    Returns
    -------
    list of str
        One normalised string per input, in the same order.
    """
    unwanted = re.compile(r'[^0-9A-Za-z ,-]+')
    return [unwanted.sub("", text.lower()) for text in col]

# Normalised job title stored as a new column, then rows per normalised title,
# most frequent first.
all_data["new_job"] = preprocess2(all_data["speaker_job"])
job_counts = (
    all_data
    .groupby("new_job")
    .count()
    .sort_values("id", ascending=False)
    .reset_index()
    .loc[:, ["new_job", "id"]]
)

job_counts

Unnamed: 0,new_job,id
0,unknown,3567
1,us senator,923
2,president,615
3,governor,537
4,president-elect,343
...,...,...
1271,government agency,1
1272,governor and chair of democratic governors ass...,1
1273,governor of georgia,1
1274,governor of pennsylvania,1


In [252]:
# Job titles that appear in at least 25 rows.
job_counts.loc[job_counts["id"] >= 25]

Unnamed: 0,new_job,id
0,unknown,3567
1,us senator,923
2,president,615
3,governor,537
4,president-elect,343
5,us representative,340
6,presidential candidate,315
7,state senator,253
8,state representative,224
9,former governor,213


In [225]:
# Rebinds job_counts: rows per raw (un-normalised) speaker_job string, most
# frequent first, then shows the titles that appear in at least 10 rows.
job_counts = (
    all_data
    .groupby("speaker_job")
    .count()
    .sort_values("id", ascending=False)
    .reset_index()
    .loc[:, ["speaker_job", "id"]]
)
job_counts.loc[job_counts["id"] >= 10]



Unnamed: 0,speaker_job,id
0,unknown,3565
1,President,615
2,U.S. Senator,595
3,Governor,487
4,President-Elect,343
...,...,...
113,"Host, Last Week Tonight",10
114,congressman,10
115,"Member, Austin City Council",10
116,U.S. Representative,10


In [231]:
# Save whichever table job_counts currently refers to; with to_csv's defaults
# the DataFrame index is written as the first, unnamed CSV column.
job_counts.to_csv("job_counts.csv")

In [None]:
# group together senate/senator/us senator/state senator
# group together congressman/congresswoman/representative/house of representatives/house member/member of us house/member of congress
# group together columnist


In [229]:
# Show the job-title labels in the order job_counts currently holds them.
job_counts["speaker_job"]

0                                             unknown
1                                           President
2                                        U.S. Senator
3                                            Governor
4                                     President-Elect
                            ...                      
1351                                    Marlins owner
1352                                Massage therapist
1353    Mayor Jim Kenney's director of communications
1354                                Mayor of Braddock
1355                                 Olympic medalist
Name: speaker_job, Length: 1356, dtype: object

In [201]:
# Rows per speaker_job value, largest first. groupby(...).size() is used instead
# of counting an "index" column: this notebook drops that column straight after
# reset_index(), so selecting it raises KeyError on a fresh top-to-bottom run.
a = all_data.groupby("speaker_job").size().sort_values(ascending=False).tolist()

# Number of job titles that occur fewer than 10 times. No loop variable named
# `val` is used, because that would overwrite the validation DataFrame `val`.
b = sum(1 for job_count in a if job_count < 10)

b

1238