In [1]:
import collections
import numpy as np
import pandas as pd

# snorkel
from snorkel.labeling import labeling_function
from snorkel.labeling import PandasLFApplier
from snorkel.labeling import LFAnalysis

In [2]:
fnid_pth = "./data/FNID/fake news detection(FakeNewsNet)/fnn_train.csv"
liar_pth = "./data/FNID/fake news detection(LIAR)/liar_train.csv"

In [3]:
ABSTAIN = -1
FAKE = 0
REAL = 1

In [4]:
data = pd.read_csv(fnid_pth)
data.head(2)

Unnamed: 0,id,date,speaker,statement,sources,paragraph_based_content,fullText_based_content,label_fnn
0,3106,2011-01-25T06:00:00-05:00,Joe Wilkinson,A national organization says Georgia has one o...,['http://www.ajc.com/news/georgia-politics-ele...,['A coalition of government watchdog groups la...,A coalition of government watchdog groups last...,fake
1,5655,2012-04-02T11:42:20-04:00,Rick Scott,"Says Barack Obama's health care law ""will be t...",['http://www.youtube.com/watch?v=TaC0mKApf9Q&f...,['As Supreme Court justices embarked on three ...,As Supreme Court justices embarked on three da...,fake


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15212 entries, 0 to 15211
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   id                       15212 non-null  int64 
 1   date                     15212 non-null  object
 2   speaker                  15212 non-null  object
 3   statement                15212 non-null  object
 4   sources                  15212 non-null  object
 5   paragraph_based_content  15212 non-null  object
 6   fullText_based_content   15212 non-null  object
 7   label_fnn                15212 non-null  object
dtypes: int64(1), object(7)
memory usage: 950.9+ KB


### Converting the label to numbers, to use it for the validation


In [6]:
data["label_numeric"] = data.apply(lambda x: int(x["label_fnn"] == "real"), axis=1)

### Initializing the sentiment analysis package, to use later


In [7]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

### Retriving the labels or valuable information from each site


In [8]:
# # contacts a url, downloads the website's content and parses it.
# def get_parsed_html(url):
#     req = Request(url, headers={"User-Agent": "Mozilla/5.0"})
#     webpage = urlopen(req).read()
#     parsed_html = BeautifulSoup(webpage)
#     return parsed_html

### www.politifact.com


In [9]:
# def get_poitifact_image_alt(url):
#     result = "abstain"
#     try:
#         parsed_html = get_parsed_html(url)
#         div = parsed_html.body.find("div", attrs={"class": "m-statement__meter"})
#         result = div.find("img", attrs={"class": "c-image__original"})["alt"]
#         time.sleep(3)
#     except Exception as e:
#         print(e)
#     return result

### www.snopes.com


In [10]:
# def get_snopes_image_alt(url):
#     result = "abstain"
#     try:
#         parsed_html = get_parsed_html(url)
#         div = parsed_html.body.find("div", attrs={"class": "media rating"})
#         result = div.find("img")["alt"]
#     except Exception as e:
#         print(e)
#     return result

### www.factcheck.org


In [11]:
# def get_factcheck_first_paragraph(url):
#     result = "abstain"
#     try:
#         parsed_html = get_parsed_html(url)
#         div = parsed_html.body.find("div", attrs={"class": "entry-content"})
#         # if the first paragraph starts with 'Q:' and the second with 'A:' than it is a Q & A style;
#         # take the second paragraph
#         # otherwise take the first.
#         parag = div.find_all("p")
#         if parag[0].text[0:3] == "Q: " and parag[1].text[0:3] == "A: ":
#             return parag[1].text
#         return parag[0].text
#     except Exception as e:
#         print(e)
#     return result

### www.factcheck.afp.com


In [12]:
# def get_factcheck_afp_title(url):
#     result = "abstain"
#     try:
#         parsed_html = get_parsed_html(url)
#         h3 = parsed_html.body.find("h3")
#         return h3.text
#     except Exception as e:
#         print(e)
#     return result

### www.twitter.com


In [13]:
# def extract_twitter_name(url):
#     start = url.find("https")
#     sub = url[20 + start : len(url)]  # removing 'https://twitter.com/'
#     index = sub.find("/")
#     if index == -1:
#         return sub
#     else:
#         return sub[:index]

### Retrieving urls of fact checking sites


In [14]:
fact_checking_sites = {
    "www.politifact.com": None,  # get_poitifact_image_alt
    "www.snopes.com": None,  # get_snopes_image_alt
    "www.twitter.com": None,  # extract_twitter_name
    "www.factcheck.org": None,  # get_factcheck_first_paragraph
    "factcheck.afp.com": None,  # get_factcheck_afp_title
    "www.washingtonpost.com/news/fact-checker": None,
    "www.realclearpolitics.com": None,
    "www.glennbeck.com": None,
}

In [15]:
# def sources_as_list(source, domain):
#     urls = source[1:-1].split(",")
#     u = []
#     for url in urls:
#         if domain in url:
#             u.append(url)
#     return u

In [16]:
# Initialize the new columns
for site in fact_checking_sites:
    data[site] = None
data_size = data.shape[0]
data_size

15212

In [17]:
fact_checking_sites_results = {
    "www.politifact.com": [None] * data_size,
    "www.snopes.com": [None] * data_size,
    "www.twitter.com": [None] * data_size,
    "www.factcheck.org": [None] * data_size,
    "factcheck.afp.com": [None] * data_size,
    "www.washingtonpost.com/news/fact-checker": [None] * data_size,
    "www.realclearpolitics.com": [None] * data_size,
    "www.glennbeck.com": [None] * data_size,
}

In [18]:
# Iterate through the records
# and looks through the sources for each fact-checking site
#
# Commented out because it takes hours to run (the sites will throttle too many requests)
# the results are presented below.


# with open("factchecking_results.txt", "a") as results:
#     for i, row in data.iterrows():
#         for site in fact_checking_sites:
#             sources = sources_as_list(row["sources"], site)
#             if len(sources) != 0:
#                 # print("{}".format(i))
#                 labels = ""
#                 for source in sources:
#                     handler = fact_checking_sites[site]
#                     if handler:
#                         # print("Handling: {} ++++++++++++++++++++++++++".format(site))
#                         source = str(source).strip()[1:-1]
#                         if len(labels) > 0:
#                             labels += ", " + handler(str(source))
#                         else:
#                             labels += handler(str(source))
#                         # print("Result: {} ++++++++++++++++++++++++++".format(labels))
#                     else:
#                         if len(labels) > 0:
#                             # print("Handling: {} ++++++++++++++++++++++++++".format(site))
#                             labels += ", " + source
#                         else:
#                             labels += source
#                     # print("Result: {} ++++++++++++++++++++++++++".format(labels))
#                 fact_checking_sites_results[site][i] = labels
#                 print("{} | {} | {}".format(i, site, labels))
#                 results.write("{} | {} | {}\n".format(i, site, labels))

In [19]:
# for site in fact_checking_sites:
#     data[site] = fact_checking_sites_results[site]

In [20]:
### ALTERNATIVE TO THE TWO COMMENTED OUT CELLS ABOVE. THE RESULTS OF CALLING THE APIS HAVE BEEN SERIALIZED TO FILE
### FOR REPRODUCABILITY, AND TO SAVE TIME.
apiResultsFile = open("./data/apiResults.txt", "r", encoding="utf-8")
for line in apiResultsFile:
    try:
        sr = line.split("|")
        row = int(sr[0].strip())
        col = sr[1].strip()
        data.at[row, col] = sr[2]
    except Exception as e:
        print(e)
apiResultsFile.close()

## Crowdsourcing - reading the results from the rated files, and adding them to the dataset


### www.glennbeck.com


In [21]:
glenbeck_ratings = pd.read_csv("./data/glennbeck_ratings.csv")

for i, row in glenbeck_ratings.iterrows():
    data.loc[data["id"] == row["id"], ["www.glennbeck.com"]] = row["www.glennbeck.com"]

### www.realclearpolitics.com/


In [22]:
rp_ratings = pd.read_csv("./data/realclearpolitics_ratings.csv")

for i, row in rp_ratings.iterrows():
    data.loc[data["id"] == row["id"], ["www.realclearpolitics.com"]] = row[
        "www.realclearpolitics.com"
    ]

### www.washingtonpost.com/news/fact-checker/


In [23]:
wp_ratings = pd.read_csv("./data/washingtonpost_ratings.csv")

for i, row in wp_ratings.iterrows():
    data.loc[
        data["id"] == row["id"], ["www.washingtonpost.com/news/fact-checker"]
    ] = row["www.washingtonpost.com/news/fact-checker"]

# Learning the labels with Snorkel


In [24]:
@labeling_function()
def label_snopes(row):
    label = row["www.snopes.com"]
    if label is not None:
        label = str(row["www.snopes.com"])
        if "real" in label:
            return REAL
        else:
            return FAKE
    else:
        return ABSTAIN

In [25]:
@labeling_function()
def label_wp(row):
    label = row["www.washingtonpost.com/news/fact-checker"]
    if label is not None:
        label = str(row["www.washingtonpost.com/news/fact-checker"])
        if "real" in label:
            return REAL
        else:
            return FAKE
    else:
        return ABSTAIN

In [26]:
@labeling_function()
def label_rp(row):
    label = row["www.realclearpolitics.com"]
    if label is not None:
        label = str(row["www.realclearpolitics.com"])
        if "real" in label:
            return REAL
        else:
            return FAKE
    else:
        return ABSTAIN

In [27]:
truth_o_meter = {
    "true": 4,
    "mostly-true": 3,
    "half-true": 2,
    "barely-true": 1,
    "mostly-false": -1,
    "false": -2,
    "pants-fire": -3,
}


@labeling_function()
def label_politifact(row):
    total_score = 0
    labels = row["www.politifact.com"]
    # print(labels)
    if labels:
        labels = str(row["www.politifact.com"]).split(",")
        # The last label has the newline character
        if len(labels) > 0:
            labels[-1] = labels[-1][:-2]
        for label in labels:
            # print(label)
            label = label.strip()
            if label in truth_o_meter:
                total_score += truth_o_meter[label]
    # print("score: {} ".format(total_score))
    if total_score > 0:
        return REAL
    if total_score < 0:
        return FAKE

    return ABSTAIN

In [28]:
def factcheck_sentiment(row, columnName):
    label = str(row[columnName])
    score = 0
    if label:
        claims = label[1:-1].split(",")
        for claim in claims:
            # print(claim)
            sentiment = sid.polarity_scores(claim)
            # print(sentiment)
            if sentiment["neg"] > sentiment["pos"]:
                score -= 1
            elif sentiment["pos"] > sentiment["neg"]:
                score += 1
        if score > 0:
            return REAL
        elif score < 0:
            return FAKE
        else:
            return ABSTAIN
    return ABSTAIN

In [29]:
@labeling_function()
def factcheckqa_sentiment(row):
    return factcheck_sentiment(row, "www.factcheck.org")

In [30]:
@labeling_function()
def factcheckafpqa_sentiment(row):
    return factcheck_sentiment(row, "factcheck.afp.com")

### Transfer Learning from the liar dataset


In [31]:
# Load the Liar dataset
liar = pd.read_csv(liar_pth)
liar.head(2)

Unnamed: 0,id,date,speaker,statement,sources,paragraph_based_content,fullText_based_content,label-liar
0,18178,2020-03-18T13:26:42-04:00,Instagram posts,"""COVID-19 started because we eat animals.""",['https://www.cdc.gov/coronavirus/2019-ncov/ca...,['Vegan Instagram users are pinning the 2019 c...,Vegan Instagram users are pinning the 2019 cor...,barely-true
1,3350,2011-03-04T09:12:59-05:00,Glenn Beck,Says Michelle Obama has 43 people on her staff...,['http://www.glennbeck.com/2011/02/25/while-wo...,['Glenn Beck rekindled a falsehood about the s...,Glenn Beck rekindled a falsehood about the siz...,pants-fire


In [32]:
# check the unique labels
labels = liar["label-liar"].unique()
labels

array(['barely-true', 'pants-fire', 'half-true', 'mostly-true', 'true',
       'false'], dtype=object)

In [33]:
counts = {}
# true speakers
counts_true = collections.Counter(
    liar[(liar["label-liar"] == "mostly-true") | (liar["label-liar"] == "true")][
        "speaker"
    ]
)
counts_true = dict(counts_true.most_common())
# false speakers
counts_false = collections.Counter(
    liar[(liar["label-liar"] == "false") | (liar["label-liar"] == "pants-fire")][
        "speaker"
    ]
)
counts_false = dict(counts_false.most_common())

In [34]:
false_percent = {}
for k, v in counts_false.items():
    total = v
    if k in counts_true:
        total += counts_true[k]
    false_percent[k] = v / total

In [35]:
true_percent = {}
for k, v in counts_true.items():
    total = v
    if k in counts_false:
        total += counts_false[k]
    true_percent[k] = v / total

In [36]:
@labeling_function()
def speaker(row):
    speaker = row["speaker"]
    if speaker in true_percent and true_percent[speaker] > 0.6:
        return REAL
    if speaker in false_percent and false_percent[speaker] > 0.6:
        return FAKE
    return ABSTAIN

## Training the snorkel model


In [37]:
data = data.sample(frac=1, random_state=1)
df_train = data[:12170]
df_valid = data[12170:]

lfs = [
    label_rp,
    label_wp,
    label_snopes,
    label_politifact,
    factcheckqa_sentiment,
    factcheckafpqa_sentiment,
    speaker,
]

applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=df_train)
LFAnalysis(L=L_train, lfs=lfs).lf_summary()

100%|██████████| 12170/12170 [00:01<00:00, 10716.34it/s]


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
label_rp,0,"[0, 1]",0.00797,0.006984,0.002219
label_wp,1,"[0, 1]",0.00986,0.008874,0.003122
label_snopes,2,[0],0.027691,0.026952,0.004108
label_politifact,3,"[0, 1]",0.244618,0.184717,0.071159
factcheckqa_sentiment,4,"[0, 1]",0.020789,0.020049,0.010764
factcheckafpqa_sentiment,5,"[0, 1]",0.000822,0.000822,0.000493
speaker,6,"[0, 1]",0.721282,0.216352,0.075842


In [38]:
# majority_model = MajorityLabelVoter()
# preds_train_majority = majority_model.predict(L=L_train)

L_valid = applier.apply(df=df_valid)
Y_valid = df_valid["label_numeric"].values
LFAnalysis(L_valid, lfs).lf_summary(Y_valid)

100%|██████████| 3042/3042 [00:00<00:00, 11302.31it/s]


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
label_rp,0,"[0, 1]",0.007561,0.006903,0.003616,23,0,1.0
label_wp,1,"[0, 1]",0.012492,0.012163,0.005588,38,0,1.0
label_snopes,2,[0],0.028271,0.026956,0.004274,77,9,0.895349
label_politifact,3,"[0, 1]",0.241289,0.173899,0.062459,452,282,0.615804
factcheckqa_sentiment,4,"[0, 1]",0.020053,0.018738,0.011177,37,24,0.606557
factcheckafpqa_sentiment,5,"[0, 1]",0.001644,0.001644,0.000986,2,3,0.4
speaker,6,"[0, 1]",0.704471,0.204142,0.06739,1644,499,0.767149


In [39]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12170 entries, 3176 to 614
Data columns (total 17 columns):
 #   Column                                    Non-Null Count  Dtype 
---  ------                                    --------------  ----- 
 0   id                                        12170 non-null  int64 
 1   date                                      12170 non-null  object
 2   speaker                                   12170 non-null  object
 3   statement                                 12170 non-null  object
 4   sources                                   12170 non-null  object
 5   paragraph_based_content                   12170 non-null  object
 6   fullText_based_content                    12170 non-null  object
 7   label_fnn                                 12170 non-null  object
 8   label_numeric                             12170 non-null  int64 
 9   www.politifact.com                        3746 non-null   object
 10  www.snopes.com                            337 non-

In [40]:
df_train[["id", "statement", "label_fnn", "label_numeric"]].to_csv(
    "./data/train_data.csv"
)
df_valid[["id", "statement", "label_fnn", "label_numeric"]].to_csv(
    "./data/valid_data.csv"
)

In [41]:
np.save("./data/L_train.npy", L_train)
np.save("./data/L_valid.npy", L_valid)