In [1]:
import pandas as pd
import numpy as np
import json
import tqdm

In [2]:
import sklearn
from sklearn import linear_model, svm
from sklearn.metrics import classification_report

In [3]:
# Very High (VH), High (H), Mostly Factual (MF), Mixed (M), Low (L), Very Low (VL)
reliability_lookup = {
	"VL": 0,
	"L": 1,
	"M": 2,
	"MF": 3,
	"H": 4,
	"VH": 5,
}

In [4]:
links_df = pd.read_csv("community_notes_tweets_urls_reliability.csv")

In [5]:
links_dict = links_df.set_index("url").to_dict("index")
links_dict = {k: reliability_lookup.get(v["rating"], 2) for k, v in links_dict.items()}

In [6]:
with open("../../data/community-notes/downloads/tweets.json", "r") as f:
	cn_tweets = json.load(f)
cn_tweets = cn_tweets["data"]

In [7]:
with open("../../data/news-tweets/downloads/tweets.json", "r") as f:
	news_tweets = json.load(f)
news_tweets = news_tweets["data"]

In [8]:
def tweet_urls(tweet):
	if "entities" in tweet and "urls" in tweet["entities"]:
		return [url["expanded_url"] for url in tweet["entities"]["urls"]]
	else:
		return []

In [9]:
def tweet_urls_reliability(tweet):
	urls = tweet_urls(tweet)
	urls = [url for url in urls if url in links_dict]
	if len(urls) == 0:
		return 2.5
	scores = [links_dict.get(url, 2.5) for url in urls]
	return sum(scores) / len(scores)

In [10]:
cn_score_list = [{"tweetId": t["id"], "linkscore": tweet_urls_reliability(t)} for t in cn_tweets]
cn_score_df = pd.DataFrame(cn_score_list)

In [11]:
news_score_list = [{"tweetId": t["id"], "linkscore": tweet_urls_reliability(t)} for t in news_tweets]
news_score_df = pd.DataFrame(news_score_list)

In [12]:
linkscore_raw_df = pd.concat([cn_score_df, news_score_df])
linkscore_raw_df.rename(columns={"linkscore": "linkscore_raw"}, inplace=True)

In [13]:
labels_df = pd.read_csv("../../data/combined/processed/tweets_cn_news.csv", dtype={"tweetId": str})
labels_df = labels_df[["tweetId", "misleading"]]
labels_df.rename(columns={"misleading": "label"}, inplace=True)
labels_df.label = labels_df.label.apply(lambda x: 1 if x else -1)

In [14]:
splits_df = pd.read_csv("../../data/combined/processed/splits.csv", dtype={"tweetId": str})
splits_dict = splits_df.set_index("tweetId").to_dict()["split"]

train_ids = set(splits_df[splits_df.split == "train"].tweetId.tolist())
val_ids = set(splits_df[splits_df.split == "val"].tweetId.tolist())

In [15]:
df = pd.merge(labels_df, linkscore_raw_df, on="tweetId", how="inner")
train_df = df.loc[df.tweetId.isin(train_ids)].reset_index(drop=True)
val_df = df.loc[df.tweetId.isin(val_ids)].reset_index(drop=True)

In [16]:
# model = linear_model.LogisticRegression()
model = svm.SVC(probability=True)
model.fit(train_df[["linkscore_raw"]], train_df["label"])
preds = model.predict(val_df[["linkscore_raw"]])
print(classification_report(val_df["label"], preds, target_names=["Not Misleading", "Misleading"]))

                precision    recall  f1-score   support

Not Misleading       0.92      0.97      0.94      1425
    Misleading       0.93      0.84      0.88       731

      accuracy                           0.92      2156
     macro avg       0.93      0.90      0.91      2156
  weighted avg       0.92      0.92      0.92      2156



In [17]:
preds_all = model.predict(df[["linkscore_raw"]])

preds_all_scores = model.predict_proba(df[["linkscore_raw"]])
preds_all_scores = preds_all_scores[np.arange(preds_all_scores.shape[0]), preds_all]
preds_all_scores = preds_all_scores * preds_all

linkscore_df = df.copy()
linkscore_df["linkscore"] = preds_all_scores
linkscore_df["linklabel"] = preds_all
linkscore_df.drop(columns=["label"], inplace=True)
linkscore_df.to_csv("linkscore.csv", index=False)

In [18]:
linkscore_df

Unnamed: 0,tweetId,linkscore_raw,linkscore,linklabel
0,253611215190896642,2.0,-0.078777,-1
1,443584873048309760,2.0,-0.078777,-1
2,618437306769215488,2.5,0.933688,1
3,642098739272773632,2.0,-0.078777,-1
4,712708069369782272,2.5,0.933688,1
...,...,...,...,...
10775,1603168527191506944,4.0,-0.078631,-1
10776,1603168547152318471,4.0,-0.078631,-1
10777,1603168554710204417,3.0,-0.078762,-1
10778,1603168595370024960,3.0,-0.078762,-1
