In [1]:
import glob
import os
import json
from enum import Enum
import numpy as np

In [2]:
class Label(Enum):
    """Binary class attached to every news item: fake (0) or real (1)."""

    # Members are created left to right, so iteration order is Fake, Real.
    Fake, Real = 0, 1

# Root of the FakeNewsNet download. Set the FAKENEWSNET_DIR environment
# variable to point somewhere else; the fallback is the original,
# machine-specific location so existing setups keep working unchanged.
dataset_dir = os.environ.get(
    'FAKENEWSNET_DIR',
    '/home/cyulin/Course/cse547/fp/FakeNewsNet/code/fakenewsnet_dataset')
out_dir = './data_v3'

if not os.path.isdir(out_dir):
    os.makedirs(out_dir)

# glob returns entries in arbitrary filesystem order. Sorting fixes the row
# order of the news items, and materialising the matches as lists means the
# paths can be inspected or iterated more than once.
p_fake_paths = sorted(glob.glob(dataset_dir + '/politifact/fake/*'))
p_real_paths = sorted(glob.glob(dataset_dir + '/politifact/real/*'))

# Parallel per-news accumulators: index k of each describes the same news item.
news_ids = []
news_users = []
news_labels = []
news_features = []

# Every user id seen while parsing (may contain repeats across news items).
user_ids = []

# NOTE(review): news_id_map, user_id_map and news_to_users are not referenced
# by any cell visible here (later cells use news_ids_map / user_ids_map);
# kept in case other code relies on them.
news_id_map = {}
user_id_map = {}

news_to_users = {}
# user -> list of news rows; filled when the graph edges are built.
users_to_news = {}

In [3]:
def parse_file_new(names, label, news_ids, news_labels, news_users, user_ids):
    """Collect, in place, the users who tweeted about each news item.

    Args:
        names: iterable of news-item directory paths. The last path
            component is used as the news id, and tweets are read from the
            ``tweets`` sub-directory of each path.
        label: value appended to ``news_labels`` once per news item.
        news_ids: list that receives one id per news item.
        news_labels: list that receives ``label`` once per news item.
        news_users: list that receives, per news item, the list of distinct
            user ids found in its tweets (first-seen order).
        user_ids: list that receives every id added to a per-news list, so
            a user who tweeted about several news items appears repeatedly.

    Returns:
        None. All results are delivered by appending to the list arguments.

    Raises:
        KeyError: if a tweet file has no ``['user']['id']`` entry.
        ValueError: if a tweet file is not valid JSON.
    """
    for file_path in names:
        # normpath strips a trailing separator so basename is never ''.
        n_id = os.path.basename(os.path.normpath(file_path))
        news_ids.append(n_id)
        news_labels.append(label)

        tweets_dir = os.path.join(file_path, 'tweets')
        # A news item without a tweets folder simply has no users; crashing
        # here would leave the parallel lists with different lengths.
        if os.path.isdir(tweets_dir):
            # listdir order is arbitrary; sort for a deterministic user order.
            tweets_files = sorted(os.listdir(tweets_dir))
        else:
            tweets_files = []

        u_id_list = []
        seen = set()  # O(1) membership test instead of rebuilding a set per tweet
        for filename in tweets_files:
            # Binary mode lets json detect the UTF encoding itself rather
            # than depending on the locale's default text encoding.
            with open(os.path.join(tweets_dir, filename), 'rb') as json_file:
                data = json.load(json_file)
            u_id = data['user']['id']
            if u_id not in seen:
                seen.add(u_id)
                u_id_list.append(u_id)
                user_ids.append(u_id)
        news_users.append(u_id_list)

In [4]:
# Ingest both PolitiFact splits into the shared accumulators, fake items
# first and real items after them.
for split_paths, split_label in ((p_fake_paths, Label.Fake),
                                 (p_real_paths, Label.Real)):
    parse_file_new(split_paths, split_label,
                   news_ids, news_labels, news_users, user_ids)

In [5]:
# convert to numpy array
news_ids = np.asarray(news_ids)
news_labels = np.asarray(news_labels)

# The per-news user lists have different lengths. np.asarray on such a ragged
# list raises ValueError on NumPy >= 1.24 (and silently builds a 2-D array
# when all lists happen to share a length), so allocate a 1-D object array
# and store one list per slot.
_users_per_news = list(news_users)
news_users = np.empty(len(_users_per_news), dtype=object)
for _row, _users in enumerate(_users_per_news):
    news_users[_row] = _users

user_ids = np.asarray(user_ids)
print(len(user_ids))   # one entry per (news item, user) pair
user_ids = np.unique(user_ids)   # sorted, duplicates removed
print(len(user_ids))   # number of distinct users

403811
295716


In [6]:
# compute id mapping: replace raw ids by dense 0-based indices
def uid_map_help(v):
    """Translate an iterable of raw user ids into their dense indices.

    The lookup table is the module-level ``user_ids_map``, resolved when the
    function is called (it is defined just below).

    Args:
        v: iterable of raw user ids.

    Returns:
        1-D ``np.int64`` array of indices, in the same order as ``v``.

    Raises:
        KeyError: if an id is missing from ``user_ids_map``.
    """
    return np.array([user_ids_map[u] for u in v], dtype=np.int64)

news_ids_map = {j: i for i, j in enumerate(news_ids)}
user_ids_map = {j: i for i, j in enumerate(user_ids)}

# news_ids is left as the raw ids (the author had its remapping disabled).
# Users have to be looked up in the *user* map; the news map has no user ids
# as keys and would turn every entry into None.
user_ids = np.array([user_ids_map[u] for u in user_ids], dtype=np.int64)

# Map each news item's user list. The rows have different lengths, and on
# Python 3 np.array(map(...)) would just wrap the map object in a 0-d array,
# so fill a 1-D object array one row at a time.
_mapped_users = [uid_map_help(u_list) for u_list in news_users]
news_users = np.empty(len(_mapped_users), dtype=object)
for _row, _indices in enumerate(_mapped_users):
    news_users[_row] = _indices

In [7]:
# Width of the binary feature vector; user indices are folded into it by
# taking them modulo this length, so distinct users may share a slot.
fea_len = 5000


def _user_bucket_vector(member_ids):
    """Return a length-``fea_len`` int32 vector with a 1 in slot
    ``u_id % fea_len`` for every id in ``member_ids`` and 0 elsewhere."""
    vec = np.zeros(fea_len, dtype=np.int32)
    for member in member_ids:
        vec[member % fea_len] = 1
    return vec


# One vector per news item, appended in the same order as news_users.
news_features.extend(_user_bucket_vector(row) for row in news_users)
news_features = np.asarray(news_features)

In [8]:
# shuffle the news arrays with same order
num_news = len(news_ids)
num_users = len(user_ids)

# A fixed seed on a private generator makes the row order reproducible from
# run to run without touching NumPy's global random state.
SHUFFLE_SEED = 42
randomize = np.random.RandomState(SHUFFLE_SEED).permutation(num_news)

# Apply the same permutation to every per-news array so they stay aligned.
news_ids = news_ids[randomize]
news_users = news_users[randomize]
news_labels = news_labels[randomize]
news_features = news_features[randomize]

In [9]:
# write labels and features to file
# One line per news item:
#   <row index> TAB <space-separated feature values> TAB <Fake|Real>
out_path = out_dir + '/politifact.labels'
with open(out_path, 'w') as f:
    for i in range(num_news):
        # ndarray.tofile writes straight to the underlying file, bypassing
        # f.write; formatting the row ourselves keeps every byte going
        # through the same buffered handle and yields the same text.
        feature_text = " ".join(map(str, news_features[i].tolist()))
        # Anything that is not Label.Fake is written as 'Real'.
        label_text = 'Fake' if news_labels[i] == Label.Fake else 'Real'
        f.write("{}\t{}\t{}\n".format(i, feature_text, label_text))

In [10]:
# construct graph edges, step 1: invert news -> users into users_to_news,
# so each user maps to the rows of the news items that user appears in.
for news_row, member_ids in enumerate(news_users):
    for member in member_ids:
        users_to_news.setdefault(member, []).append(news_row)

In [11]:
# Link every pair of news rows that share a user. Within one user's list the
# pairs are emitted as (earlier entry, later entry).
# NOTE(review): two news items sharing k users produce the same pair k times;
# confirm whether downstream expects these repeated edges.
edges = []
for n_ids in users_to_news.values():
    for i in range(len(n_ids)):
        for j in range(i + 1, len(n_ids)):
            edges.append((n_ids[i], n_ids[j]))
# np.int was only an alias of the builtin int and was removed in NumPy 1.24.
# The reshape keeps a two-column shape even when there are no edges at all.
edges = np.asarray(edges, dtype=int).reshape(-1, 2)

In [12]:
# Dump the edge array as text, one array row per line, integers only.
out_path = out_dir + '/politifact.edges'
np.savetxt(fname=out_path, X=edges, fmt='%i')