# Graph creation

This notebook takes the output of the preprocessing notebook and creates the graph used for model training. This notebook uses existing features to create new features such as retweet ratio or hashtag count, and it converts the columns into a data format suitable for GNN model use. This notebook also tokenizes and creates word embeddings for each tweet.

You may have to `pip install` some of the libraries below.

In [None]:
import pandas as pd
import getpass
from pandas import json_normalize
import numpy as np
import math
from collections import Counter
import ast
from tqdm import tqdm

import re
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import gensim
import gensim.downloader as api

In [None]:
user = getpass.getuser()
# Change dirpath to location of TwiBot-22 dataset
dirpath = f'/scratch/{user}/datasets/TwiBot22/'
labels = pd.read_csv(f'{dirpath}/label.csv')
users = pd.read_json(f'{dirpath}/user.json')

In [115]:
tweet_arr = []
for i in range(9):
    tweet_arr.append(pd.read_csv(f'{dirpath}/processed_tweet{i}.csv'))

  tweet_arr.append(pd.read_csv(f'/scratch/bae9wk/datasets/TwiBot22/processed_tweet{i}.csv'))
  tweet_arr.append(pd.read_csv(f'/scratch/bae9wk/datasets/TwiBot22/processed_tweet{i}.csv'))
  tweet_arr.append(pd.read_csv(f'/scratch/bae9wk/datasets/TwiBot22/processed_tweet{i}.csv'))
  tweet_arr.append(pd.read_csv(f'/scratch/bae9wk/datasets/TwiBot22/processed_tweet{i}.csv'))
  tweet_arr.append(pd.read_csv(f'/scratch/bae9wk/datasets/TwiBot22/processed_tweet{i}.csv'))
  tweet_arr.append(pd.read_csv(f'/scratch/bae9wk/datasets/TwiBot22/processed_tweet{i}.csv'))
  tweet_arr.append(pd.read_csv(f'/scratch/bae9wk/datasets/TwiBot22/processed_tweet{i}.csv'))
  tweet_arr.append(pd.read_csv(f'/scratch/bae9wk/datasets/TwiBot22/processed_tweet{i}.csv'))
  tweet_arr.append(pd.read_csv(f'/scratch/bae9wk/datasets/TwiBot22/processed_tweet{i}.csv'))


In [None]:
tweets = pd.concat(tweet_arr)

In [117]:
len(tweets)

2132482

In [118]:
tweets[:2]

Unnamed: 0.1,Unnamed: 0,author_id,mentioned_ids,mention_count,hashtag_count,symbols_count,urls_count,retweet_count,reply_count,like_count,quote_count,text,in_reply_to_user_id,created_at,conversation_id,id
0,107,9.992381e+17,[36054548],1.0,0.0,0.0,0.0,0.0,,0.0,,@KalengaKamwendo About that 🤦‍♂️,,,,
1,249,2838935000.0,[3123238004],1.0,0.0,0.0,0.0,347.0,,0.0,,RT @afcstuff: Mikel Arteta on the late winner:...,,,,


# Feature creation

In [119]:
def add_relationship(df):
    if (str(df["text"])[:2] == "RT"):
        return "Retweet"
    elif (pd.notna(df["in_reply_to_user_id"])):
        return "Reply"
    else:
        return "Post"

In [120]:
tweets["relationship"] = tweets.apply(lambda x : add_relationship(x), axis=1)

In [121]:
def target_user(df):
    if (df["relationship"] == "Retweet"):
        return df["mentioned_ids"]
    elif (df["relationship"] == "Reply"):
        return df["in_reply_to_user_id"]
    else:
        return np.nan

In [122]:
tweets["target_user_id"] = tweets.apply(lambda x : target_user(x), axis=1)

In [123]:
tweets["target_user_id"][:5]

0             NaN
1    [3123238004]
2     [343627165]
3             NaN
4             NaN
Name: target_user_id, dtype: object

In [124]:
tweets = tweets.drop(["in_reply_to_user_id", "mentioned_ids"], axis=1)

In [125]:
tweets = tweets.rename({"author_id": "source_user_id", "id": "tweet_id"}, axis=1)

In [126]:
unique_ids = set(np.concatenate((tweets["source_user_id"],tweets[tweets["target_user_id"].notnull()]["target_user_id"])))

In [127]:
len(unique_ids)

782578

In [130]:
users[:1]

Unnamed: 0,created_at,description,entities,id,location,name,pinned_tweet_id,profile_image_url,protected,public_metrics,url,username,verified,withheld
0,2020-01-16 02:02:55+00:00,Theoretical Computer Scientist. See also https...,"{'url': {'urls': [{'start': 0, 'end': 23, 'url...",u1217628182611927040,"Cambridge, MA",Boaz Barak,,https://pbs.twimg.com/profile_images/125226236...,False,"{'followers_count': 7316, 'following_count': 2...",https://t.co/BoMip9FF17,boazbaraktcs,False,


In [131]:
users = pd.concat([users, json_normalize(users["entities"])], axis = 1)
users = pd.concat([users, json_normalize(users["public_metrics"])], axis = 1)

In [132]:
users[:1]

Unnamed: 0,created_at,description,entities,id,location,name,pinned_tweet_id,profile_image_url,protected,public_metrics,...,withheld,url.urls,description.urls,description.mentions,description.hashtags,description.cashtags,followers_count,following_count,tweet_count,listed_count
0,2020-01-16 02:02:55+00:00,Theoretical Computer Scientist. See also https...,"{'url': {'urls': [{'start': 0, 'end': 23, 'url...",u1217628182611927040,"Cambridge, MA",Boaz Barak,,https://pbs.twimg.com/profile_images/125226236...,False,"{'followers_count': 7316, 'following_count': 2...",...,,"[{'start': 0, 'end': 23, 'url': 'https://t.co/...","[{'start': 41, 'end': 64, 'url': 'https://t.co...",,,,7316,215,3098,69


In [133]:
users["url.urls"] = users["url.urls"].fillna(0) 
users["url.urls"] = users["url.urls"].apply(lambda x : len(x) if x else 0)
users["description.urls"] = users["description.urls"].fillna(0) 
users["description.urls"] = users["description.urls"].apply(lambda x : len(x) if x else 0)
users["description.mentions"] = users["description.mentions"].fillna(0) 
users["description.mentions"] = users["description.mentions"].apply(lambda x : len(x) if x else 0)
users["description.hashtags"] = users["description.hashtags"].fillna(0) 
users["description.hashtags"] = users["description.hashtags"].apply(lambda x : len(x) if x else 0)
users["description.cashtags"] = users["description.cashtags"].fillna(0) 
users["description.cashtags"] = users["description.cashtags"].apply(lambda x : len(x) if x else 0)
users = users.drop(['entities', 'public_metrics'], axis = 1)
users["source_user_id"] = users["id"].apply(lambda x : int(x.replace("u", "")))

In [134]:
def convert_timestamps(ts):
    
    try:
        return pd.to_datetime(ts, exact = False, errors = 'raise')
    except ValueError:
        try:
            float_ts = float(ts)
        except Exception as e:
            print(ts)
            print(e)
            return None
        else:
            return pd.to_datetime(float_ts / 1e9, unit = 's', utc = True)

In [135]:
users['created_at_ts'] = users.apply(lambda x : convert_timestamps(x['created_at']), axis=1)

In [136]:
users['created_at_ts'].dtype

datetime64[ns, UTC]

In [137]:
users['username_length'] = users['username'].apply(len)
users['name_length'] = users['name'].apply(len)
users['description_length'] = users['description'].apply(len)

In [138]:
users['num_digits_username'] = users['username'].apply(lambda x: sum(c.isdigit() for c in x))

In [139]:
def entropy(s):
    probabilities = [n_x/len(s) for x, n_x in Counter(s).items()]
    e = -sum([p * math.log(p) / math.log(2.0) for p in probabilities])
    return e

In [140]:
users['username_entropy'] = users['username'].apply(entropy)
users['description_entropy'] = users['description'].apply(entropy)

In [141]:
def string_similarity(s1, s2):
    common_chars = sum(1 for char1, char2 in zip(s1, s2) if char1 == char2)
    combined_length = len(s1) + len(s2)
    similarity = (2 * common_chars) / combined_length if combined_length > 0 else 0
    return similarity

In [142]:
users['names_similarity'] = users.apply(lambda x: string_similarity(x['name'], x['username']), axis = 1)

In [143]:
users['names_ratio'] = users.apply(lambda x: len(x['username']) / len(x['name']) if len(x['name']) > 0 else 0, axis = 1)

In [144]:
users['reputation'] = users['following_count'] / (users['followers_count'] + 1e-9)

In [145]:
today = pd.Timestamp.now().tz_localize('UTC')
users['account_age'] = (today - users['created_at_ts']).dt.days / users['tweet_count']

In [146]:
tweets_2 = tweets.copy()

In [147]:
problematic_rows = []

for ind, row in tweets_2.iterrows():
    try:
        _ = int(row['source_user_id'])
    except ValueError:
        problematic_rows.append(ind)

In [149]:
graph = tweets_2.drop(problematic_rows).reset_index(drop=True)

In [150]:
graph['created_at_ts'] = graph.apply(lambda x : convert_timestamps(x['created_at']), axis=1)

IOStream.flush timed out


In [151]:
graph['source_user_id'] = graph['source_user_id'].apply(lambda x: int(x))

In [152]:
graph[:2]

Unnamed: 0.1,Unnamed: 0,source_user_id,mention_count,hashtag_count,symbols_count,urls_count,retweet_count,reply_count,like_count,quote_count,text,created_at,conversation_id,tweet_id,relationship,target_user_id,created_at_ts
0,107,999238148075991040,1.0,0.0,0.0,0.0,0.0,,0.0,,@KalengaKamwendo About that 🤦‍♂️,,,,Post,,NaT
1,249,2838934563,1.0,0.0,0.0,0.0,347.0,,0.0,,RT @afcstuff: Mikel Arteta on the late winner:...,,,,Retweet,[3123238004],NaT


In [153]:
graph['retweet_count'] = graph['retweet_count'].fillna(0)
graph['reply_count'] = graph['reply_count'].fillna(0)
graph['like_count'] = graph['like_count'].fillna(0)
graph['quote_count'] = graph['quote_count'].fillna(0)


In [154]:
user_ids = set(np.concatenate((graph['source_user_id'] , graph[graph['target_user_id'].notnull()]['target_user_id'])))
user = users[users["source_user_id"].isin(user_ids)]

In [155]:
labels["source_user_id"] = labels["id"].apply(lambda x : int(x.replace("u", "")))

In [156]:
user = user.merge(labels, left_on="source_user_id", right_on="source_user_id", how = "inner")

In [157]:
#tweets = tweets.drop(["pinned_tweet_id", "created_at", "withheld"], axis=1)

In [158]:
def retweet_ratio(s):
    if 'Retweet' in s.value_counts():
        return (s.value_counts()['Retweet'] + 1) / (len(s) + 3)
    else:
        return 1/(len(s) + 3)
retweet_ratios = pd.DataFrame(graph.groupby('source_user_id')['relationship'].agg(retweet_ratio))

retweet_ratios['source_user_id'] = retweet_ratios.index

retweet_ratios.reset_index(drop = True, inplace = True)

retweet_ratios.columns = ['retweet_ratio', 'source_user_id']

In [159]:
users['verified'] = users['verified'].apply(lambda x: bool(x))

In [160]:
user = pd.merge(user, retweet_ratios, on = 'source_user_id', how = 'left')

user['retweet_ratio'] = user.groupby('label')['retweet_ratio'].transform(lambda x: x.fillna(x.mean()))

In [161]:
user['retweet_ratio'].isnull().sum()

0

In [162]:
url_count = pd.DataFrame(graph.groupby('source_user_id')['urls_count'].sum())
url_count['source_user_id'] = url_count.index
url_count.reset_index(drop = True, inplace = True)
url_count.columns = ['url_count_tweets', 'source_user_id']
user = pd.merge(user, url_count, on = 'source_user_id', how = 'left')



In [163]:
user['url_ratio'] = (user['url.urls'] + user["url_count_tweets"]) / user['tweet_count']

In [164]:
user['url_ratio'] = user.groupby('label')['url_ratio'].transform(lambda x: x.fillna(x.mean()))

user['url_ratio'].isnull().sum()

0

In [165]:
url_max = pd.DataFrame(graph.groupby('source_user_id')['urls_count'].max())

url_max['source_user_id'] = url_max.index
url_max.reset_index(drop = True, inplace = True)
url_max.columns = ['url_max_tweets', 'source_user_id']
user = pd.merge(user, url_max, on = 'source_user_id', how = 'left')

user['url_max_tweets'] = user.groupby('label')['url_max_tweets'].transform(lambda x: x.fillna(x.mean())) 

user['url_max_tweets'].isnull().sum()

0

In [166]:
def tweets_standard_deviation(s):
    if len(s) < 2:
        return np.nan
    time_diffs = s.diff().dt.total_seconds()[1:]
    mean_time_diff = time_diffs.mean()
    
    sd = (time_diffs - mean_time_diff) ** 2
    
    mean = np.sqrt(sd.mean()) / 86400
    
    return mean


tweet_time_sd = pd.DataFrame(graph.groupby('source_user_id')['created_at_ts'].agg(tweets_standard_deviation))
tweet_time_sd['source_user_id'] = tweet_time_sd.index
tweet_time_sd.reset_index(drop = True, inplace = True)
tweet_time_sd.columns = ['time_interval_sd_day', 'source_user_id']

user = pd.merge(user, tweet_time_sd, on = 'source_user_id', how = 'left')

user['time_interval_sd_day'] = user.groupby('label')['time_interval_sd_day'].transform(lambda x: x.fillna(x.mean())) 

user['time_interval_sd_day'].isnull().sum()

0

In [167]:
tweets_2.columns

Index(['Unnamed: 0', 'source_user_id', 'mention_count', 'hashtag_count',
       'symbols_count', 'urls_count', 'retweet_count', 'reply_count',
       'like_count', 'quote_count', 'text', 'created_at', 'conversation_id',
       'tweet_id', 'relationship', 'target_user_id'],
      dtype='object')

In [168]:
mention_count = pd.DataFrame(graph.groupby('source_user_id')['mention_count'].sum())

mention_count['source_user_id'] = mention_count.index
mention_count.reset_index(drop = True, inplace = True)
mention_count.columns = ['mention_count_tweets', 'source_user_id']
user = pd.merge(user, mention_count, on = 'source_user_id', how = 'left')

user['mention_count_tweets'] = user.groupby('label')['mention_count_tweets'].transform(lambda x: x.fillna(x.mean())) 

user['url_max_tweets'].isnull().sum()

0

In [169]:
mention_max = pd.DataFrame(graph.groupby('source_user_id')['mention_count'].max())

mention_max['source_user_id'] = mention_max.index
mention_max.reset_index(drop = True, inplace = True)
mention_max.columns = ['mention_max_tweets', 'source_user_id']
user = pd.merge(user, mention_max, on = 'source_user_id', how = 'left')

user['mention_max_tweets'] = user.groupby('label')['mention_max_tweets'].transform(lambda x: x.fillna(x.mean())) 

user['mention_max_tweets'].isnull().sum()

0

In [174]:
graph['hashtag_count'] = graph['hashtag_count'].apply(lambda x: int(float((str(x).replace("t", "")))))

In [175]:
hashtag_count = pd.DataFrame(graph.groupby('source_user_id')['hashtag_count'].sum())

hashtag_count['source_user_id'] = hashtag_count.index
hashtag_count.reset_index(drop = True, inplace = True)
hashtag_count.columns = ['hashtag_count_tweets', 'source_user_id']
user = pd.merge(user, hashtag_count, on = 'source_user_id', how = 'left')

user['hashtag_count_tweets'] = user.groupby('label')['hashtag_count_tweets'].transform(lambda x: x.fillna(x.mean())) 

user['hashtag_count_tweets'].isnull().sum()

0

In [176]:
hashtag_max = pd.DataFrame(graph.groupby('source_user_id')['hashtag_count'].max())
hashtag_max['source_user_id'] = hashtag_max.index
hashtag_max.reset_index(drop = True, inplace = True)
hashtag_max.columns = ['hashtag_max_tweets', 'source_user_id']
user = pd.merge(user, hashtag_max, on = 'source_user_id', how = 'left')

user['hashtag_max_tweets'] = user.groupby('label')['hashtag_max_tweets'].transform(lambda x: x.fillna(x.mean())) 

user['hashtag_max_tweets'].isnull().sum()

0

In [186]:
tweet_counts = graph.groupby('source_user_id').size().reset_index(name = 'tweet_count')

graph['tweet_length'] = graph['text'].str.len()

average_length = graph.groupby('source_user_id')['tweet_length'].mean().reset_index(name = 'avg_tweet_length')

def contains(arr):
    if isinstance(arr, list):
        return len(arr) > 0
    return False

url_tweets = graph[graph['urls_count'] > 0].groupby('source_user_id').size().reset_index(name = 'url_tweet_count')

In [187]:
user = pd.merge(user, tweet_counts, on = 'source_user_id', how = 'left')
user = pd.merge(user, average_length, on = 'source_user_id', how = 'left')
user = pd.merge(user, url_tweets, on = 'source_user_id', how = 'left')


In [188]:
user.rename(columns = {'tweet_count_y': 'tweet_count'}, inplace = True)
user = user.drop("tweet_count", axis = 1)
user = user.rename({'tweet_count_x': 'tweet_count'}, axis = 1)

In [189]:
cols = ['avg_tweet_length', 'url_count_tweets', 'url_tweet_count']

for col in cols:
    user[col] = user.groupby('label')[col].transform(lambda x: x.fillna(x.mean()))

In [190]:
user.isnull().sum()

created_at                   0
description                  0
id_x                         0
location                 37699
name                         0
pinned_tweet_id          95680
profile_image_url            0
protected                    0
url                          0
username                     0
verified                     0
withheld                189077
url.urls                     0
description.urls             0
description.mentions         0
description.hashtags         0
description.cashtags         0
followers_count              0
following_count              0
tweet_count                  0
listed_count                 0
source_user_id               0
created_at_ts                0
username_length              0
name_length                  0
description_length           0
num_digits_username          0
username_entropy             0
description_entropy          0
names_similarity             0
names_ratio                  0
reputation                   0
account_

In [192]:
graph['text'] = graph['text'].astype(str)

In [193]:
nltk.download('wordnet')
nltk.download('stopwords')

tokenizer = TweetTokenizer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_tweets(tweet):
    tweet_cleaned = re.sub(r'https?://[^ ]+', '', tweet)
    tweet_cleaned = re.sub(r'@\w+', '', tweet_cleaned)
    tweet_cleaned = re.sub(r'RT', '', tweet_cleaned)
    tweet_cleaned = tweet_cleaned.lower()
    tweet_cleaned = re.sub(r'[^\w\s]', '', tweet_cleaned)

    tokens = tokenizer.tokenize(tweet_cleaned)

    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

    return tokens

graph['tokens'] = graph['text'].apply(preprocess_tweets)


[nltk_data] Downloading package wordnet to /home/bae9wk/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/bae9wk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [194]:
glove_model = api.load('glove-twitter-100')

def get_embeddings(tweet_tokens, glove_model):
    
    embeddings = [glove_model[token] for token in tweet_tokens if token in glove_model]
    
    if not embeddings:
        return None
    
    tweet_embedding = sum(embeddings) / len(embeddings)
    return tweet_embedding


In [195]:
graph['tweet_embedding'] = graph['tokens'].apply(lambda x: get_embeddings(x, glove_model))

In [196]:
graph[['tweet_embedding', 'text']][:5]

Unnamed: 0,tweet_embedding,text
0,,@KalengaKamwendo About that 🤦‍♂️
1,"[0.1034311, 0.2273521, 0.26947024, -0.19995345...",RT @afcstuff: Mikel Arteta on the late winner:...
2,"[0.09070051, 0.24574916, 0.3635225, -0.2007760...",RT @premierleague: GOAL Southampton 1-0 Norwic...
3,"[0.52692664, 0.20889334, -0.34069332, -0.13844...",@neurodruid Read the article. It's included in...
4,,https://t.co/A7CkZRpvdr


In [197]:
cols_to_keep = ["source_user_id", "target_user_id", "relationship", "tweet_embedding"]
graph_filtered = graph[cols_to_keep]

In [198]:
graph_filtered = graph_filtered.drop(graph_filtered[graph_filtered["target_user_id"].isnull()].index, axis = 0)

In [199]:
user = user.drop(['id_x', 'id_y'], axis = 1)

In [200]:
cols = ['reputation', 'retweet_ratio', 'url_count_tweets', 'url_ratio', 'url_max_tweets', 'time_interval_sd_day', 'mention_count_tweets',
       'hashtag_count_tweets', 'hashtag_max_tweets', 'account_age']

for col in cols:
    user[col] = (user[col] - user[col].min()) / (user[col].max() - user[col].min())


In [201]:
user = user.drop(['description', 'created_at', 'location', 'name', 'profile_image_url', 'url', 'username'], axis = 1)


In [202]:
user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 189091 entries, 0 to 189090
Data columns (total 41 columns):
 #   Column                Non-Null Count   Dtype              
---  ------                --------------   -----              
 0   pinned_tweet_id       93411 non-null   float64            
 1   protected             189091 non-null  bool               
 2   verified              189091 non-null  bool               
 3   withheld              14 non-null      object             
 4   url.urls              189091 non-null  int64              
 5   description.urls      189091 non-null  int64              
 6   description.mentions  189091 non-null  int64              
 7   description.hashtags  189091 non-null  int64              
 8   description.cashtags  189091 non-null  int64              
 9   followers_count       189091 non-null  int64              
 10  following_count       189091 non-null  int64              
 11  tweet_count           189091 non-null  int64        

In [204]:
user = user.drop(['pinned_tweet_id', 'withheld', 'protected', 'verified'], axis = 1)

In [205]:
user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 189091 entries, 0 to 189090
Data columns (total 37 columns):
 #   Column                Non-Null Count   Dtype              
---  ------                --------------   -----              
 0   url.urls              189091 non-null  int64              
 1   description.urls      189091 non-null  int64              
 2   description.mentions  189091 non-null  int64              
 3   description.hashtags  189091 non-null  int64              
 4   description.cashtags  189091 non-null  int64              
 5   followers_count       189091 non-null  int64              
 6   following_count       189091 non-null  int64              
 7   tweet_count           189091 non-null  int64              
 8   listed_count          189091 non-null  int64              
 9   source_user_id        189091 non-null  int64              
 10  created_at_ts         189091 non-null  datetime64[ns, UTC]
 11  username_length       189091 non-null  int64        

In [206]:
graph_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1185051 entries, 1 to 1836771
Data columns (total 4 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   source_user_id   1185051 non-null  int64 
 1   target_user_id   1185051 non-null  object
 2   relationship     1185051 non-null  object
 3   tweet_embedding  1088761 non-null  object
dtypes: int64(1), object(3)
memory usage: 45.2+ MB


In [None]:
graph = graph_filtered

In [211]:
problematic_rows = []
successful_matches = []

for index, row in tqdm(graph.iterrows(), total=len(graph)):
    try:
        # Convert string representation of list to actual list of integers
        target_id_str = row['target_user_id']
        
        if isinstance(target_id_str, str):
            # Using ast.literal_eval to safely convert string representation of list to actual list
            target_ids = ast.literal_eval(target_id_str)
            # Convert to list if it's not already (handles single integer case)
            if not isinstance(target_ids, list):
                target_ids = [target_ids]
        else:
            target_ids = target_id_str if isinstance(target_id_str, list) else [target_id_str]
        
        # Convert all elements to integers
        target_ids = [int(x) for x in target_ids if pd.notna(x)]
        
        # Check if any of the target_ids exist in user['source_user_id']
        matching_users = user[user['source_user_id'].isin(target_ids)]
        
        if not matching_users.empty:
            successful_matches.append(index)
        else:
            problematic_rows.append(index)
            
    except Exception as e:
        print(f"Error processing row {index}: {str(e)}")
        problematic_rows.append(index)

print(f"Successfully matched rows: {len(successful_matches)}")
print(f"Problematic rows: {len(problematic_rows)}")

# Let's verify with a test case
test_row = graph.iloc[0]
print(f"\nTest case verification:")
print(f"Original value: {test_row['target_user_id']}")
test_ids = ast.literal_eval(test_row['target_user_id']) if isinstance(test_row['target_user_id'], str) else test_row['target_user_id']
print(f"Converted value: {test_ids}")
print(f"Type of converted value: {type(test_ids)}")
matches = user[user['source_user_id'].isin(test_ids)]
print(f"Matches found: {len(matches)}")

1582920	/scratch/bae9wk/datasets/TwiBot22/graph_final.csv


In [None]:
graph = graph.drop(problematic_rows)

In [None]:
graph_filtered.to_csv('{dirpath}/graph_cleaned.csv', index=False)
user.to_csv('{dirpath}/user_final.csv', index=False)