In [5]:
import numpy as np
import re
import nltk
import pandas as pd
#from tensorflow.keras.preprocessing.text import Tokenizer
#from tensorflow.keras.preprocessing.sequence import pad_sequences
from torch_geometric.data import HeteroData
import torch
import torch_geometric.transforms as T
from sklearn.preprocessing import RobustScaler
import pickle
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [7]:
# Read CSV file
# NOTE(review): absolute, machine-specific path; the later cells use the columns
# `time`, `reply_time`, `id`, `reply_text`, ... of this file.
file_path = r"/home/azureuser/rumour-detection-pheme/charliehebdo-all-rnr-threads.csv"
df = pd.read_csv(file_path)


In [8]:
# Both timestamp columns share one textual layout, so parse them with a single format string
TWEET_TIME_FORMAT = '%a %b %d %H:%M:%S +0000 %Y'
for time_column in ('time', 'reply_time'):
    df[time_column] = pd.to_datetime(df[time_column], format=TWEET_TIME_FORMAT)

### Time for replies and Number of replies

In [9]:
 # Minutes elapsed between the source post and each reply
 df['time_diff'] = (df['reply_time'] - df['time']).dt.total_seconds() / 60

In [10]:
# Dense-rank the reply delays within each post (rows sharing the same `id`)
delays_by_post = df.groupby('id')['time_diff']
df['reply_number'] = delays_by_post.rank(method='dense')

#### Number of replies x Retweet counts

In [11]:
# Post-level attributes, one row per distinct combination of these columns
post_columns = ['id', 'text', 'followers', 'favorite_count', 'retweet_count',
                'verified', 'rumour', 'user_id']
posts_unique = df[post_columns].drop_duplicates()

# Per-post reply statistics: number of non-null delays and the first delay in row order
reply_stats = (
    df.groupby(['id'])
      .agg(replies=('time_diff', 'count'), first_time_diff=('time_diff', 'first'))
      .reset_index()
)

df_posts = posts_unique.merge(reply_stats, on="id", how="left")

#### Word embeddings

In [12]:
# Download NLTK stopwords
nltk.download('stopwords')

# English stop-word list; clean_text (defined below) uses it as the default for `stop_words`
english_stopwords = nltk.corpus.stopwords.words('english')

# function for cleaning data
def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of the regex ``pattern`` removed.

    Parameters
    ----------
    input_txt : str
        Text to clean.
    pattern : str
        Regular expression; each non-overlapping match is deleted.

    Returns
    -------
    str
        The text without the matched spans.

    Notes
    -----
    The substitution is done with the pattern itself. Re-using every matched
    *text* as a new regular expression (as a find-then-substitute loop does)
    is wrong in two ways: a short match that is a prefix of a longer one
    ("@ab" vs "@abc") leaves the tail of the longer one behind, and matched
    text containing regex metacharacters is interpreted as a pattern.
    """
    return re.sub(pattern, '', input_txt)


def clean_text(
    string: str,
    punctuations=r'''!()-[]{};:'"\,<>./?@#$%^&*_~''',
    stop_words=english_stopwords) -> str:
    """
    Normalise a piece of text.

    Steps, in order: drop URLs, drop HTML-like tags, drop punctuation
    characters, lower-case, drop stop words, collapse whitespace.

    Parameters:
        string: text to normalise.
        punctuations: characters to delete from the text.
        stop_words: words (compared after lower-casing) to discard.

    Returns:
        The normalised, single-space separated text.
    """
    # URLs first, then anything that looks like an HTML tag
    without_urls = re.sub(r'https?://\S+|www\.\S+', '', string)
    without_tags = re.sub(r'<.*?>', '', without_urls)

    # Characters of the lower-cased text that are listed in `punctuations`
    to_strip = {ch for ch in without_tags.lower() if ch in punctuations}
    depunctuated = ''.join(ch for ch in without_tags if ch not in to_strip)

    # Lower-case, then keep only the words that are not stop words
    kept_words = [token for token in depunctuated.lower().split()
                  if token not in stop_words]

    # Collapse any run of whitespace into a single space
    return re.sub(r'\s+', ' ', ' '.join(kept_words)).strip()

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
# Clean the post text in three passes: strip @-mentions, blank out every
# character that is not a letter or '#', then apply clean_text.
# - Raw strings avoid the invalid-escape-sequence warning for "\w".
# - regex=True is explicit because newer pandas treats the pattern of
#   Series.str.replace as a literal string by default, which would turn the
#   character-class replacement into a silent no-op.
df_posts['clean_text'] = np.vectorize(remove_pattern)(df_posts['text'], r"@[\w]*")
df_posts['clean_text'] = df_posts['clean_text'].str.replace(r"[^a-zA-Z#]", " ", regex=True).apply(clean_text)

  df_posts['clean_text'] = np.vectorize(remove_pattern)(df_posts['text'], "@[\w]*")


In [14]:
def embedding_vocab(filepath, word_index,embedding_dim):
    """Build an embedding matrix for the words in ``word_index``.

    Parameters
    ----------
    filepath : str
        UTF-8 text file with one entry per line: a word followed by its
        whitespace-separated vector components.
    word_index : dict
        Maps each word to its row in the returned matrix.
    embedding_dim : int
        Number of vector components kept per word (longer vectors are truncated).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(word_index) + 1, embedding_dim)``; rows of words
        that do not occur in the file stay all-zero.
    """
    vocab_size = len(word_index) + 1
    embedding_matrix_vocab = np.zeros((vocab_size, embedding_dim))

    with open(filepath, encoding="utf8") as f:
        for line in f:
            fields = line.split()
            # A blank (or whitespace-only) line has no word to unpack; skip it
            # instead of failing on the whole file.
            if not fields:
                continue
            word, vector = fields[0], fields[1:]
            if word in word_index:
                embedding_matrix_vocab[word_index[word]] = np.array(
                    vector, dtype=np.float32)[:embedding_dim]
    return embedding_matrix_vocab

In [16]:
# NOTE(review): import placed mid-notebook; `keras` is needed from this cell onwards.
from tensorflow import keras

#from tensorflow.keras.preprocessing.sequence import pad_sequences
# Fit a word index on the cleaned post texts
tokenizer = keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(df_posts['clean_text'])

In [None]:
#import zipfile

#with zipfile.ZipFile('glove.6B.zip', 'r') as zip_ref:
#    zip_ref.extractall('.')

In [17]:
# Vectors from 'glove.6B.100d.txt' for every word the post tokenizer indexed
embedding_dim = 100
embedding_matrix_vocab = embedding_vocab(
    filepath='glove.6B.100d.txt',
    word_index=tokenizer.word_index,
    embedding_dim=embedding_dim,
)

In [21]:
# Convert every cleaned post into a list of word indices
sequences = tokenizer.texts_to_sequences(df_posts['clean_text'])

# Pad all index lists to the length of the longest one
max_sequence_length = max(map(len, sequences))
padded_sequences = keras.preprocessing.sequence.pad_sequences(
    sequences,
    maxlen=max_sequence_length,
)

# Function to get embeddings for a sequence
def get_embeddings(sequence, embedding_matrix):
    """Stack the rows of ``embedding_matrix`` selected by the indices in ``sequence``.

    Returns a NumPy array with one row per index, in the order given; an empty
    ``sequence`` gives an empty array.
    """
    return np.array([embedding_matrix[token_id] for token_id in sequence])

# One matrix of word vectors per post, built from the un-padded index lists
df_posts['embeddings'] = [
    get_embeddings(token_ids, embedding_matrix_vocab)
    for token_ids in sequences
]

In [22]:
# Average the word vectors of each post into a single fixed-size vector.
# A post whose cleaned text has no tokens has nothing to average: np.mean
# would return a scalar NaN, and the later stacking of these vectors into one
# 2-D array would fail. Use a zero vector in that case, as is done for replies.
array_avg = [
    np.mean(token_vectors, axis=0) if len(token_vectors) else np.zeros(embedding_dim)
    for token_vectors in df_posts.embeddings
]
df_posts['embeddings_avg'] = array_avg

#### Reply embeddings

In [23]:
# Clean the reply text in three passes: strip @-mentions, blank out every
# character that is not a letter or '#', then apply clean_text.
# - Raw strings avoid the invalid-escape-sequence warning for "\w".
# - regex=True is explicit because newer pandas treats the pattern of
#   Series.str.replace as a literal string by default, which would turn the
#   character-class replacement into a silent no-op.
df['clean_reply_text'] = np.vectorize(remove_pattern)(df['reply_text'], r"@[\w]*")
df['clean_reply_text'] = df['clean_reply_text'].str.replace(r"[^a-zA-Z#]", " ", regex=True).apply(clean_text)

  df['clean_reply_text'] = np.vectorize(remove_pattern)(df['reply_text'], "@[\w]*")


In [24]:
# create the dictionary
# NOTE: this rebinds `tokenizer`; from here on it indexes the reply vocabulary,
# not the post vocabulary fitted earlier.
tokenizer =  keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(df['clean_reply_text'])

In [26]:
# Vectors from 'glove.6B.100d.txt' for every word the reply tokenizer indexed
# (overwrites the matrix that was built for the posts)
embedding_dim = 100
embedding_matrix_vocab = embedding_vocab(
    filepath='glove.6B.100d.txt',
    word_index=tokenizer.word_index,
    embedding_dim=embedding_dim,
)

In [27]:
# Convert every cleaned reply into a list of word indices
sequences = tokenizer.texts_to_sequences(df['clean_reply_text'])

# Pad all index lists to the length of the longest one
max_sequence_length = max(map(len, sequences))
padded_sequences = keras.preprocessing.sequence.pad_sequences(
    sequences,
    maxlen=max_sequence_length,
)

# One matrix of word vectors per reply, built from the un-padded index lists
df['reply_embeddings'] = [
    get_embeddings(token_ids, embedding_matrix_vocab)
    for token_ids in sequences
]

In [28]:
# Mean word vector per reply; a reply without tokens yields NaN here
array_avg = [np.mean(reply_vectors, axis=0) for reply_vectors in df.reply_embeddings]
df['reply_embeddings_avg'] = array_avg

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [29]:

# Entries that are not ndarrays (the mean over an empty token list is not an
# array) are replaced by a zero vector of length 100.
df['reply_embeddings_avg'] = df['reply_embeddings_avg'].apply(lambda x: x if isinstance(x, np.ndarray) else np.zeros(100))

In [30]:
# Drop the 'Unnamed: 0' column (presumably a saved index column - confirm).
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

#### Saving the cleaned DataFrames (as pickle files)

In [31]:
# Display the reply-level frame, including the columns added in the cells above
df

Unnamed: 0,id,text,time,location,followers,user_id,favorite_count,retweet_count,verified,rumour,...,reply_time,reply_location,reply_followers,reply_user_id,reply_verified,time_diff,reply_number,clean_reply_text,reply_embeddings,reply_embeddings_avg
0,552783238415265792,"Breaking: At least 10 dead, 5 injured after tO...",2015-01-07 11:06:08,Paris,1628,384779793,14,159,False,1,...,2015-01-07 11:24:15,,40,202572421,False,18.116667,1.0,religion peace strikes,"[[0.38767001032829285, 0.5266600251197815, 0.3...","[-0.1889099975426992, 0.3738733381032944, -0.2..."
1,552783238415265792,"Breaking: At least 10 dead, 5 injured after tO...",2015-01-07 11:06:08,Paris,1628,384779793,14,159,False,1,...,2015-01-07 11:31:37,Zimbabwe-London,375,239943362,False,25.483333,2.0,hi henry would willing give itv news phone int...,"[[0.1444000005722046, 0.23978999257087708, 0.9...","[-0.12313784658908844, -0.18030815256329683, 0..."
2,552783238415265792,"Breaking: At least 10 dead, 5 injured after tO...",2015-01-07 11:06:08,Paris,1628,384779793,14,159,False,1,...,2015-01-07 11:38:37,delhi,17,2903715212,False,32.483333,3.0,please call terrorists gunmen dont dilute news...,"[[-0.9112600088119507, 0.3958500027656555, 1.2...","[-0.19128870833665132, 0.15544349609408528, 0...."
3,552783238415265792,"Breaking: At least 10 dead, 5 injured after tO...",2015-01-07 11:06:08,Paris,1628,384779793,14,159,False,1,...,2015-01-07 11:45:32,,54,2911191121,False,39.400000,4.0,french govt needs take strict action,"[[0.02704799920320511, -0.0538330003619194, 0....","[-0.06885033225019772, 0.2138711006846279, 0.1..."
4,552783238415265792,"Breaking: At least 10 dead, 5 injured after tO...",2015-01-07 11:06:08,Paris,1628,384779793,14,159,False,1,...,2015-01-07 12:32:50,Shredsville,683,1348798826,False,86.700000,5.0,people didnt hand guns jeffrey epstein also cf...,"[[0.2901900112628937, 0.8049700260162354, 0.31...","[0.014819002151489258, 0.014486800134181976, -..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19045,553592195786506240,#BREAKING - Both #CharlieHebdo suspects killed...,2015-01-09 16:40:39,"Paris, France",1261930,1994321,41,133,True,0,...,2015-01-09 16:42:42,"Montreal, Canada",14085,7207042,False,2.050000,1.0,rt breaking charliehebdo suspects killed polic...,"[[-0.34817999601364136, -0.10100000351667404, ...","[-0.04054700657725334, -0.028997698239982127, ..."
19046,553592195786506240,#BREAKING - Both #CharlieHebdo suspects killed...,2015-01-09 16:40:39,"Paris, France",1261930,1994321,41,133,True,0,...,2015-01-09 16:42:54,قبرص(اليونان),189,2814170438,False,2.250000,2.0,coming,"[[0.0667089968919754, 0.2744799852371216, 0.81...","[0.0667089968919754, 0.2744799852371216, 0.814..."
19047,553592195786506240,#BREAKING - Both #CharlieHebdo suspects killed...,2015-01-09 16:40:39,"Paris, France",1261930,1994321,41,133,True,0,...,2015-01-09 16:43:11,Planet Earth,281,16573662,False,2.533333,3.0,happy hear two terrorists gone doom praise inv...,"[[-0.09043599665164948, 0.1963600069284439, 0....","[-0.10175399528816342, 0.3584062494337559, 0.2..."
19048,553592195786506240,#BREAKING - Both #CharlieHebdo suspects killed...,2015-01-09 16:40:39,"Paris, France",1261930,1994321,41,133,True,0,...,2015-01-09 16:46:50,UK,793,1067999394,False,6.183333,4.0,well done french ending chapter terror terrori...,"[[-0.5308600068092346, 0.5140399932861328, 0.0...","[0.047132634981112045, 0.16482700068842282, 0...."


In [46]:
# Persist the reply-level frame (written relative to the current working directory)
df.to_pickle('replies_charlie_hebdo.pkl')

In [47]:
# Persist the post-level frame (written relative to the current working directory)
df_posts.to_pickle("posts_charlie_hebdo.pkl")

#### Create Graph

In [147]:
# Reload the two frames saved by the to_pickle cells.
# Read from the same relative paths they were written to. The previous
# absolute '/workspaces/...' root differs from both the save location and the
# CSV location, so a top-to-bottom run could load a stale copy or fail.
df_posts = pd.read_pickle("posts_charlie_hebdo.pkl")
df_replies = pd.read_pickle("replies_charlie_hebdo.pkl")

In [148]:
#df_post_not_rumour = df_posts[df_posts.rumour==0]
#total_samples = int(df_posts[df_posts.rumour==1].count()[0]*1.1)

# Get the proportion of each value in the 'replies' column
#value_counts = df_post_not_rumour['replies'].value_counts(normalize=True)

# Calculate the number of samples for each value
#samples_per_value = (value_counts * total_samples).round().astype(int)

# Sample the required number of rows for each value in 'replies'
#sampled_dfs = []
#for value, num_samples in samples_per_value.items():
#    sampled_dfs.append(df_post_not_rumour[df_post_not_rumour['replies'] \
#                       == value].sample(num_samples, random_state=42))

# Concatenate all the samples into a single dataframe
#sampled_df = pd.concat(sampled_dfs)
# Shuffle the resulting dataframe
#sampled_df = sampled_df.sample(frac=1, random_state=42).reset_index(drop=True)
#df_posts = pd.concat([sampled_df,df_posts[df_posts.rumour==1]])\
#             .sample(frac=1,random_state=42).reset_index(drop=True)

#df_replies = df_replies[df_replies.id.isin(df_posts.id.values)][['id','reply_id',\
#             'reply_followers','reply_user_id','reply_verified',"rumour",\
#              "user_id","reply_embeddings_avg","time_diff"]]

#### Torch Data

In [149]:
# One-hot encoding of `verified` into two complementary 0/1 columns.
# The indicators are derived directly from the flag instead of via
# pd.get_dummies, for two reasons:
# - get_dummies only creates columns for values that actually occur, so a
#   frame with a single class would lack `verified` or `no_verified`;
# - re-running the cell no longer appends a duplicate `no_verified` column.
verified_flag = df_posts['verified'].astype('str').str.\
             replace(' ', '').replace('True', '1').replace('False', '0')\
             .astype('int64')

df_posts = df_posts.drop(columns=['verified', 'no_verified'], errors='ignore')\
             .assign(no_verified=1 - verified_flag, verified=verified_flag)

In [150]:
# One-hot encoding of `reply_verified` into two complementary 0/1 columns.
# The indicators are derived directly from the flag instead of via
# pd.get_dummies, for two reasons:
# - get_dummies only creates columns for values that actually occur, so a
#   frame with a single class would lack one of the two columns;
# - re-running the cell no longer appends a duplicate `reply_no_verified` column.
reply_verified_flag = df_replies['reply_verified'].astype('str').str.\
             replace(' ', '').replace('True', '1').replace('False', '0')\
             .astype('int64')

df_replies = df_replies.drop(columns=['reply_verified', 'reply_no_verified'], errors='ignore')\
             .assign(reply_no_verified=1 - reply_verified_flag,
                     reply_verified=reply_verified_flag)

In [151]:

# Number the distinct post ids 0..n-1 in order of first appearance
unique_values = df_posts['id'].unique()
value_to_id = dict(zip(unique_values, range(len(unique_values))))
post_map = value_to_id

# Columns that make up the numeric part of the post features
post_feature_columns = ["followers", "favorite_count", "retweet_count",
                        "no_verified", "verified", "first_time_diff"]
post_features = df_posts[post_feature_columns]


In [152]:
# Collect the per-post average embeddings into a plain Python list
embeddings_avg = list(df_posts['embeddings_avg'])

In [153]:


# Robust-scale the continuous post features; the 0/1 indicators stay unscaled.
scaler = RobustScaler()
numeric_columns = ['followers', 'favorite_count', 'retweet_count', 'first_time_diff']
scaled_features = scaler.fit_transform(post_features[numeric_columns])

# Rebuild a DataFrame on the SAME index as the input. The indicator columns
# are attached below by index alignment; with a fresh 0..n-1 index they would
# silently become NaN whenever post_features does not carry a default index.
scaled_data = pd.DataFrame(scaled_features, columns=numeric_columns,
                           index=post_features.index)

# Add the binary features back to the scaled data
scaled_data['no_verified'] = post_features['no_verified']
scaled_data['verified'] = post_features['verified']
post_features = scaled_data

In [154]:
# Convert to numpy: scaled post features side by side with the averaged text embedding
x1 = np.concatenate((post_features.to_numpy(),np.array(embeddings_avg)),axis=1)
#x1 = post_features.to_numpy()
x1.shape # [num_post_nodes x post_node_feature_dim]

(2002, 106)

In [155]:
# Select node features
retweets_node_features = df_replies[["reply_followers", "reply_no_verified", "reply_verified","reply_user_id","time_diff"]]

# Get unique values from the column
unique_values = retweets_node_features['reply_user_id'].unique()

# Create a dictionary to map values to IDs
# (each distinct reply_user_id gets an integer 0..n_users-1, in order of first appearance)
value_to_id = {value: i for i, value in enumerate(unique_values)}

# Map the values to IDs using the dictionary
#retweets_node_features['index'] = retweets_node_features['reply_user_id'].map(value_to_id)
retweets_id_mapping  = value_to_id




#Only keep features
# NOTE(review): this frame still has one row PER REPLY, whereas
# retweets_id_mapping numbers UNIQUE users. When a user replies more than once,
# row k of the feature matrix built from this frame is not the features of
# user k, so it will not line up with edge targets built from the mapping.
# TODO: keep one feature row per user (e.g. first row per reply_user_id).
retweets_node_features = retweets_node_features[["reply_followers", "reply_no_verified", "reply_verified","time_diff"]]


In [156]:
# Collect the per-reply average embeddings into a plain Python list
embeddings_avg = list(df_replies['reply_embeddings_avg'])

In [157]:
# Robust-scale the continuous reply features; the 0/1 indicators stay unscaled.
scaler = RobustScaler()
reply_numeric_columns = ['reply_followers', 'time_diff']
scaled_features = scaler.fit_transform(retweets_node_features[reply_numeric_columns])

# Rebuild a DataFrame on the SAME index as the input. The indicator columns
# are attached below by index alignment; with a fresh 0..n-1 index they would
# silently become NaN whenever retweets_node_features does not carry a
# default index.
scaled_data = pd.DataFrame(scaled_features, columns=reply_numeric_columns,
                           index=retweets_node_features.index)

# Add the binary features back to the scaled data
scaled_data['reply_no_verified'] = retweets_node_features['reply_no_verified']
scaled_data['reply_verified'] = retweets_node_features['reply_verified']
retweets_node_features = scaled_data


In [158]:
# Convert to numpy (the averaged reply embeddings are currently NOT appended; see the commented line)
#x2 = np.concatenate((retweets_node_features.to_numpy(),np.array(embeddings_avg)),axis=1)
x2 = retweets_node_features.to_numpy()
x2.shape # [num_reply_rows x reply_feature_dim]

(19050, 4)

In [159]:
# Target vector: the `rumour` column of the post frame, one value per post
labels = df_posts['rumour']
y = labels.to_numpy()
y.shape

(2002,)

In [160]:
# Map post IDs
# NOTE(review): both columns are overwritten in place with their integer node
# indices, so this cell cannot be run twice: on a second run the values are no
# longer keys of the mappings, map() yields NaN and astype(int) fails.
#post_map = posts_id_mapping.reset_index().set_index("id").to_dict()
df_replies["id"] = df_replies['id'].map(post_map).astype(int)
# Map user IDs
#retweets_map = retweets_id_mapping #retweets_id_mapping.reset_index().set_index("reply_user_id").to_dict()
df_replies["reply_user_id"] = df_replies["reply_user_id"].map(retweets_id_mapping)#.astype(int)

In [161]:
# Row 0: post node index, row 1: reply-user node index, one column per reply
edge_columns = ["id", "reply_user_id"]
edge_index = df_replies[edge_columns].to_numpy().T
edge_index # [2 x num_edges]

array([[    0,     0,     0, ...,  2001,  2001,  2001],
       [    0,     1,     2, ...,   284,  1810, 14464]])

In [162]:

# Number of rows (one per post node)
num_rows = x1.shape[0]

# Desired proportions; the test split receives whatever remains after
# train and validation have been cut.
train_proportion = 0.70
val_proportion = 0.15
test_proportion = 0.15

# Shuffle the node indices with a fixed seed so the train/val/test split, and
# every metric computed on it, is reproducible across kernel restarts.
SPLIT_SEED = 42
rng = np.random.default_rng(SPLIT_SEED)
indices = rng.permutation(num_rows)

# Calculate the split indices
train_end = int(train_proportion * num_rows)
val_end = train_end + int(val_proportion * num_rows)

# Split the indices into train, validation, and test
train_indices = indices[:train_end]
val_indices = indices[train_end:val_end]
test_indices = indices[val_end:]

# Create boolean masks, one flag per post node
train_mask = np.zeros(num_rows, dtype=bool)
val_mask = np.zeros(num_rows, dtype=bool)
test_mask = np.zeros(num_rows, dtype=bool)

# Assign True to the corresponding indices
train_mask[train_indices] = True
val_mask[val_indices] = True
test_mask[test_indices] = True


In [163]:
# Initialize the HeteroData object
data = HeteroData()

# Post nodes ('id'): features, labels and the three split masks
data['id'].x = torch.tensor(x1,dtype=torch.float32)
data['id'].y = torch.from_numpy(y)
data['id'].train_mask = torch.tensor(train_mask)  # Training mask
data['id'].val_mask =torch.tensor(val_mask)  # Validation mask
data['id'].test_mask = torch.tensor(test_mask)  # Test mask

# Reply-user nodes ('reply_user_id'): features only
data['reply_user_id'].x = torch.tensor(x2,dtype=torch.float32)

# Edges for the (id, retweet, reply_user_id) relation.
# edge_index is already [2 x num_edges]. It must not be reshaped to
# (2, len(x2)): the number of edges is unrelated to the number of reply-node
# feature rows, and such a reshape only succeeds when the two happen to match.
# ascontiguousarray gives torch a contiguous buffer (edge_index is a
# transposed view).
data['id', 'retweet', 'reply_user_id'].edge_index = torch.from_numpy(np.ascontiguousarray(edge_index))
data = T.ToUndirected()(data)
print(data)


HeteroData(
  id={
    x=[2002, 106],
    y=[2002],
    train_mask=[2002],
    val_mask=[2002],
    test_mask=[2002],
  },
  reply_user_id={ x=[19050, 4] },
  (id, retweet, reply_user_id)={ edge_index=[2, 19050] },
  (reply_user_id, rev_retweet, id)={ edge_index=[2, 19050] }
)


In [164]:
# Sum of the labels and size of the training split
train_labels = data['id'].y[train_mask]
print("train:", train_labels.sum(), train_labels.shape)

train: tensor(307) torch.Size([1401])


In [165]:
# Sum of the labels and size of the validation split
val_labels = data['id'].y[val_mask]
print("val:", val_labels.sum(), val_labels.shape)

val: tensor(65) torch.Size([300])


In [166]:
# Sum of the labels and size of the test split
test_labels = data['id'].y[test_mask]
print("test:", test_labels.sum(), test_labels.shape)

test: tensor(75) torch.Size([301])


In [167]:
# Serialise the assembled HeteroData object to disk (relative to the working directory)
with open('charlie_hebdo_graph_dataset_node_embeddings_v2.pkl', 'wb') as f:
    pickle.dump(data, f)

In [168]:

import torch_geometric.transforms as T
from sklearn.metrics import precision_score, recall_score

In [169]:
# Load the HeteroData object from the pkl file
with open('charlie_hebdo_graph_dataset_node_embeddings_v2.pkl', 'rb') as f:
    data = pickle.load(f)