In [1]:
import pandas as pd
import numpy as np

In [2]:
import sklearn
from sklearn import linear_model, svm
from sklearn.metrics import classification_report

In [3]:
# Account-level features for the author of each tweet, keyed by tweetId.
data = pd.read_csv("../../data/combined/processed/users_cn_news.csv")[
    ["tweetId", "verified", "followers_count", "following_count", "tweet_count", "listed_count", "created_at"]
]

# Tweet-level labels: a truthy `misleading` flag becomes +1, anything else -1.
labels_df = (
    pd.read_csv("../../data/combined/processed/tweets_cn_news.csv")[["tweetId", "misleading"]]
    .rename(columns={"misleading": "label"})
    .assign(label=lambda frame: frame["label"].apply(lambda flag: 1 if flag else -1))
)

# Attach the label to every feature row (default inner join on tweetId).
data = data.merge(labels_df, on="tweetId")

# The split file assigns each tweetId to a named partition; only train/val are used here.
splits = pd.read_csv("../../data/combined/processed/splits.csv")
train_ids = set(splits.loc[splits["split"] == "train", "tweetId"].values)
val_ids = set(splits.loc[splits["split"] == "val", "tweetId"].values)

in_train = data["tweetId"].isin(train_ids)
in_val = data["tweetId"].isin(val_ids)
data_train = data[in_train]
data_val = data[in_val]

In [4]:
# Preview the merged feature/label frame; the notebook renders a truncated view.
data

Unnamed: 0,tweetId,verified,followers_count,following_count,tweet_count,listed_count,created_at,label
0,1398781917097562117,0,743,584,14179,0,1298164547,1
1,1399409517637640193,0,311881,926,19709,934,1459021268,1
2,1399438157666557956,0,33348,1085,38544,278,1338947030,1
3,1399773024417689600,1,141753,2583,23923,2721,1306975918,1
4,1399775621069291520,0,13382,1014,12388,0,1233232636,1
...,...,...,...,...,...,...,...,...
10775,1447566135893929988,0,0,0,0,0,1671193465,1
10776,1497769670123925507,0,0,0,0,0,1671193465,1
10777,1557817195220049921,0,0,0,0,0,1671193465,-1
10778,1586105803144327168,0,0,0,0,0,1671193465,1


In [5]:
# Model inputs: the account-level columns fed to the classifier, in this order.
x_cols = [
    "verified",
    "followers_count",
    "following_count",
    "tweet_count",
    "listed_count",
    "created_at",
]
# Target column holding the +1 / -1 label.
y_col = "label"

In [6]:
# Fit an SVM on the training split and report per-class metrics on validation.
#
# random_state pins the internal cross-validation shuffling that
# probability=True uses to calibrate predict_proba; without it the exported
# probability scores differ from run to run.
#
# NOTE(review): the features are passed in unscaled although the displayed data
# shows very different magnitudes (created_at ~1e9 vs. verified in {0, 1}).
# scikit-learn recommends standardising inputs for SVMs; consider wrapping the
# estimator in a StandardScaler pipeline and re-checking the metrics below.
model = svm.SVC(probability=True, class_weight="balanced", C=1.0, verbose=False, random_state=0)
model.fit(data_train[x_cols], data_train[y_col])
preds = model.predict(data_val[x_cols])

# Pin the label order explicitly so each display name is guaranteed to be
# attached to the intended class (-1 -> "Not Misleading", 1 -> "Misleading").
print(
    classification_report(
        data_val[y_col],
        preds,
        labels=[-1, 1],
        target_names=["Not Misleading", "Misleading"],
    )
)

                precision    recall  f1-score   support

Not Misleading       0.66      0.36      0.47      1425
    Misleading       0.34      0.63      0.44       731

      accuracy                           0.46      2156
     macro avg       0.50      0.50      0.46      2156
  weighted avg       0.55      0.46      0.46      2156



In [7]:
# NOTE(review): these metrics are computed on the sign-flipped predictions
# (-preds), i.e. every validation prediction is inverted before scoring.
# The export cell writes the unflipped predictions; confirm which one is intended.
val_truth = data_val[y_col]
flipped_preds = -preds

user_accuracy = sklearn.metrics.accuracy_score(val_truth, flipped_preds)
user_precision = sklearn.metrics.precision_score(val_truth, flipped_preds)
user_recall = sklearn.metrics.recall_score(val_truth, flipped_preds)

print("User score accuracy:", user_accuracy)
print("User score precision:", user_precision)
print("User score recall:", user_recall)

User score accuracy: 0.5445269016697588
User score precision: 0.3409378960709759
User score recall: 0.3679890560875513


In [8]:
# Score every row with the fitted model and export a signed confidence per tweet.
features_all = data[x_cols]
preds_all = model.predict(features_all)

# predict_proba returns one column per entry of model.classes_ (sorted, so
# [-1, 1] here). The predicted labels must be translated into column positions
# before indexing: using the raw labels as indices makes -1 wrap around to the
# last column, so both classes would read P(class 1).
proba_all = model.predict_proba(features_all)
pred_columns = np.searchsorted(model.classes_, preds_all)
preds_all_scores = proba_all[np.arange(proba_all.shape[0]), pred_columns]

# Sign the probability of the predicted class with the predicted label:
# positive for predicted misleading (+1), negative for predicted not misleading (-1).
# NOTE(review): SVC's calibrated probabilities can disagree with predict(), so
# the magnitude may fall below 0.5 for some rows.
preds_all_scores = preds_all_scores * preds_all

predictions = pd.DataFrame({"tweetId": data["tweetId"], "userlabel": preds_all, "userscore": preds_all_scores})
predictions.to_csv("userscores.csv", index=False)