In [1]:
import pandas as pd
import numpy as np

In [2]:
import sklearn
from sklearn import linear_model, svm
from sklearn.metrics import classification_report

In [3]:
# Load the tweet table, keep the id, the target and the four engagement
# counts, and recode the target as +1 (truthy "misleading") / -1 (otherwise).
data = (
    pd.read_csv("../../data/combined/processed/tweets_cn_news.csv")
    .loc[:, ["tweetId", "misleading", "likes", "replies", "retweets", "quotes"]]
    .rename(columns={"misleading": "label"})
    .assign(label=lambda frame: frame["label"].apply(lambda flag: 1 if flag else -1))
)

# The split file assigns each tweetId to a named split; collect the ids of the
# two splits used in this notebook.
splits = pd.read_csv("../../data/combined/processed/splits.csv")
is_train_row = splits["split"] == "train"
is_val_row = splits["split"] == "val"
train_ids = set(splits.loc[is_train_row, "tweetId"].values)
val_ids = set(splits.loc[is_val_row, "tweetId"].values)

# Partition the tweets by membership in each id set.
in_train = data["tweetId"].isin(train_ids)
in_val = data["tweetId"].isin(val_ids)
data_train = data[in_train]
data_val = data[in_val]

In [4]:
# Show the prepared frame as a sanity check: it should hold tweetId, the
# recoded label (1 / -1) and the four engagement-count columns.
data

Unnamed: 0,tweetId,label,likes,replies,retweets,quotes
0,253611215190896642,1,468,273,431,258
1,443584873048309760,1,7984,8461,4276,10943
2,618437306769215488,1,831,775,1054,8
3,642098739272773632,-1,23676,1973,21133,4893
4,712708069369782272,1,34744,10776,18566,30613
...,...,...,...,...,...,...
10775,1603168527191506944,-1,0,0,0,0
10776,1603168547152318471,-1,0,0,0,0
10777,1603168554710204417,-1,0,0,0,0
10778,1603168595370024960,-1,0,0,0,0


In [5]:
# Fit an SVM on the four engagement counts and evaluate it on the validation split.
feature_columns = ["likes", "replies", "retweets", "quotes"]

# probability=True makes SVC fit an extra cross-validated calibration step so
# that predict_proba is available later. That step shuffles the data, so the
# seed is fixed to keep the exported probabilities reproducible across runs.
# predict() does not depend on the seed, so the report below is unaffected.
# NOTE(review): the counts are fed in unscaled and SVC defaults to an RBF
# kernel; consider standardising the features — confirm whether that is intended.
model = svm.SVC(probability=True, random_state=0)
model.fit(data_train[feature_columns], data_train["label"])
preds = model.predict(data_val[feature_columns])

# Labels are -1 / 1 and the report lists classes in ascending order, so the
# first name belongs to -1 (not misleading) and the second to 1 (misleading).
print(classification_report(data_val["label"], preds, target_names=["Not Misleading", "Misleading"]))

                precision    recall  f1-score   support

Not Misleading       0.85      0.97      0.91      1425
    Misleading       0.93      0.68      0.78       731

      accuracy                           0.87      2156
     macro avg       0.89      0.83      0.85      2156
  weighted avg       0.88      0.87      0.87      2156



In [6]:
# Headline validation metrics for the engagement-only classifier.
# precision/recall use scikit-learn's default positive label (1).
val_labels = data_val["label"]
headline_metrics = (
    ("Engagement score accuracy:", sklearn.metrics.accuracy_score),
    ("Engagement score precision:", sklearn.metrics.precision_score),
    ("Engagement score recall:", sklearn.metrics.recall_score),
)
for caption, metric in headline_metrics:
    print(caption, metric(val_labels, preds))

Engagement score accuracy: 0.87291280148423
Engagement score precision: 0.9287054409005628
Engagement score recall: 0.6771545827633378


In [7]:
# Score every tweet (all splits) and export a signed confidence per tweet:
# the probability of the predicted class, multiplied by the predicted label
# (positive for label 1, negative for label -1).
feature_columns = ["likes", "replies", "retweets", "quotes"]
features_all = data[feature_columns]

preds_all = model.predict(features_all)
class_probabilities = model.predict_proba(features_all)

# predict_proba columns follow model.classes_ (sorted ascending), so the
# predicted labels must be translated to column positions first. Indexing the
# columns directly with the labels is wrong: NumPy reads -1 as "last column",
# which makes both -1 and 1 select the probability of class 1.
predicted_columns = np.searchsorted(model.classes_, preds_all)
row_positions = np.arange(class_probabilities.shape[0])
preds_all_scores = class_probabilities[row_positions, predicted_columns]

# Fold the predicted label into the sign of the score.
preds_all_scores = preds_all_scores * preds_all

predictions = pd.DataFrame({"tweetId": data["tweetId"], "engagementlabel": preds_all, "engagementscore": preds_all_scores})
predictions.to_csv("engagementscores.csv", index=False)