In [1]:
import pandas as pd
import csv

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [8]:
def encode_label(label:str):
    """Translate a verdict label into its integer class code.

    The codes are the positions in the tuple below:
    'true' -> 0, 'mostly-true' -> 1, 'barely-true' -> 2,
    'half-true' -> 3, 'false' -> 4, 'pants-fire' -> 5.

    Args:
        label: The verdict string to encode. Matching is exact
            (case- and whitespace-sensitive).

    Returns:
        The integer code for a recognised label, or -1 for anything else
        (including non-string values).

    NOTE(review): 'barely-true' is coded before 'half-true', so the codes are
    not in a strict most-to-least-truthful order -- confirm this is intended
    before treating the codes as ordinal.
    """
    # Order matters: each label's index in this tuple is its class code.
    ordered_labels = (
        'true',
        'mostly-true',
        'barely-true',
        'half-true',
        'false',
        'pants-fire',
    )
    for code, known in enumerate(ordered_labels):
        if label == known:
            return code
    # Unrecognised label.
    return -1

In [9]:
# Read ../data/politifact_plus.csv, discard the columns this notebook does not
# use, rename 'when/where' to 'context', and replace the textual verdict in
# 'target' with its integer code from encode_label (-1 for unknown labels).
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks
# like an index that was written out with the CSV -- confirm whether the file
# should be read with index_col=0 instead.
df = (
    pd.read_csv('../data/politifact_plus.csv')
    .drop(columns=['documented_time', 'author_score', 'summaries', 'article'])
    .rename(columns={'when/where': 'context'})
    .assign(target=lambda frame: frame['target'].apply(encode_label))
)
df.head()

Unnamed: 0,source,context,headline,target,speaker,src_true,src_mostly_true,src_half_true,src_mostly_false,src_false,src_pants_on_fire
0,Instagram posts,"stated on October 28, 2023 in a screenshot sha...",“Haaretz investigation reveals discrepancies i...,4,Madison Czopek,5.0,3.0,16.0,54.0,480.0,157.0
1,Scott Walker,"stated on May 30, 2023 in Interview:",“Wisconsin has historically … and I think larg...,2,Laura Schulte,26.0,45.0,39.0,41.0,44.0,11.0
2,Instagram posts,"stated on October 27, 2023 in a post:","“The airport in Salzburg, Austria, has a count...",4,Ciara O'Rourke,5.0,3.0,16.0,54.0,480.0,157.0
3,Viral image,"stated on October 27, 2023 in an Instagram post:",Video shows Palestinians pretending to be corp...,4,Ciara O'Rourke,4.0,13.0,35.0,53.0,764.0,340.0
4,Facebook posts,"stated on September 25, 2023 in a Facebook post:",The life span of a wind tower generator lasts ...,4,Loreben Tuquero,24.0,50.0,108.0,247.0,1532.0,595.0


In [10]:
from clickbait import ClickbaitModel
from sentiment import SentimentModel
from spam import SpamModel
from source_reliable import SourceReliableModel

# 80/20 train/test split with a fixed seed so the partition is reproducible.
# 'target' stays inside the feature frame as well as being the label vector:
# the sub-models below are fitted with X_train only (no separate y).
# NOTE(review): presumably they read their labels from the 'target' column --
# confirm, and make sure 'target' is never used as a predictive feature.
X_train, X_test, y_train, y_test = train_test_split(
    df[[
        'source', 'context', 'headline', 'speaker', 'target',
        'src_true', 'src_mostly_true', 'src_half_true',
        'src_mostly_false', 'src_false', 'src_pants_on_fire',
    ]],
    df['target'],
    test_size=.2,
    random_state=11,
)

# Instantiate the four sub-models (same order as before: click, senti, spam, src).
clickM, sentiM, spamM, srcM = (
    ClickbaitModel(),
    SentimentModel(),
    SpamModel(),
    SourceReliableModel(),
)

# Only three of the four models are fitted on this split.
# NOTE(review): clickM.fit is never called here -- presumably ClickbaitModel
# trains or loads itself on construction; confirm.
sentiM.fit(X_train)
spamM.fit(X_train)
srcM.fit(X_train)


Training Accuracy: 0.952578
Testing Accuracy: 0.946250
              precision    recall  f1-score   support

           0       0.95      0.95      0.95     12823
           1       0.95      0.95      0.95     12777

    accuracy                           0.95     25600
   macro avg       0.95      0.95      0.95     25600
weighted avg       0.95      0.95      0.95     25600

Training Accuracy: 0.977538
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00       373
           0       0.98      1.00      0.99     20063
           1       0.00      0.00      0.00        88

    accuracy                           0.98     20524
   macro avg       0.33      0.33      0.33     20524
weighted avg       0.96      0.98      0.97     20524

Training Accuracy: 0.590431
              precision    recall  f1-score   support

       false       0.00      0.00      0.00      8406
        true       0.59      1.00      0.74     12118

    accuracy     

In [13]:
# Hard-coded per-model scores, indexed in the order clickbait, sentiment,
# spam, source (see the weight[...] indices used below).
# NOTE(review): these look hand-copied from the accuracy printouts of the
# model-fitting cell -- confirm they are still current, and consider reading
# them from the models instead of retyping them.
acc_lst = [0.946, 0.976, 0.412, 0.589]
# Normalise the scores so the four weights sum to 1.
weight = [acc/sum(acc_lst) for acc in acc_lst]
# For each sub-model: take element [1] of its predict() result on the test
# split and scale it by that model's weight.
# NOTE(review): assumes predict() returns an indexable result whose item [1]
# holds the per-row values to weight -- confirm against the model classes.
clickbaitV = clickM.predict(X_test)[1] * weight[0]
sentiV = sentiM.predict(X_test)[1] * weight[1]
spamV = spamM.predict(X_test)[1] * weight[2]
sourceV = srcM.predict(X_test)[1] * weight[3]

In [28]:
# Training-split counterpart of the weighted test scores: element [1] of each
# sub-model's predict() result on X_train, scaled by the same per-model
# entries of `weight` (0 = clickbait, 1 = sentiment, 2 = spam, 3 = source).
# Requires `weight` to have been computed in an earlier cell.
# NOTE(review): assumes predict() returns an indexable result whose item [1]
# holds the per-row values to weight -- confirm against the model classes.
train_clickbaitV = clickM.predict(X_train)[1] * weight[0]
train_sentiV = sentiM.predict(X_train)[1] * weight[1]
train_spamV = spamM.predict(X_train)[1] * weight[2]
train_sourceV = srcM.predict(X_train)[1] * weight[3]

In [36]:
# Collect the four weighted sub-model outputs into one feature table per
# split. Both tables use the same column names in the same order.
veracity_train = pd.DataFrame({
    'clickbait': train_clickbaitV,
    'sentiment': train_sentiV,
    'spam': train_spamV,
    'source': train_sourceV,
})
veracity_test = pd.DataFrame({
    'clickbait': clickbaitV,
    'sentiment': sentiV,
    'spam': spamV,
    'source': sourceV,
})

In [37]:
# Meta-classifier: logistic regression (liblinear solver) trained on the
# weighted sub-model features to predict the encoded verdict. fit() returns
# the estimator itself, so it can be chained onto the constructor.
clf = LogisticRegression(solver='liblinear').fit(veracity_train, y_train)
# Last expression of the cell: score of the classifier on the held-out split.
clf.score(veracity_test, y_test)

0.3047138047138047