## Setup

In [1]:
import networkx as nx
import numpy as np
import pandas as pd

## Graph

In [4]:
# Read the graph from its GraphML file, located relative to the notebook's directory.
graph_path = "../data/users_clean.graphml"
G = nx.read_graphml(graph_path)

In [8]:
# Report the size of the graph: its node count and its edge count.
summary = f"The graph contains {G.number_of_nodes()} nodes and {G.number_of_edges()} edges."
print(summary)

The graph contains 100386 nodes and 2286592 edges.


## Node attributes

In [9]:
# Load the per-user attribute table; the first CSV column becomes the row index.
csv_path = "../data/users_neighborhood_anon.csv"
df = pd.read_csv(csv_path, index_col=0)

For a description of the features, see https://www.kaggle.com/datasets/manoelribeiro/hateful-users-on-twitter

We only keep the features that represent attributes from the Twitter profile and from the text of the tweets, i.e. we remove the features that can be computed from the graph. Specifically, we do not consider the features aggregated from neighbors, since we will use Graph Machine Learning methods that, hopefully, extract the same kind of information in better ways. We also do not consider the features that are computed from the graph structure, such as centrality measures, since we will extract them ourselves later.

In [36]:
# Snapshot of every column name in the loaded table, in file order.
columns = list(df.columns)

# Hand-picked columns to keep, in the order they should appear in the final frame.
# Entries left commented out are deliberately excluded (neighbor aggregates and
# graph-derived measures, per the note above); they stay listed so the omission
# is visible to the reader.
features = [
    "user_id",
    "hate",
    # "hate_neigh",
    # "normal_neigh",
    "statuses_count",
    "followers_count",
    "followees_count",
    "favorites_count",
    "listed_count",
    # "betweenness",
    # "eigenvector",
    # "in_degree",
    # "out_degree",
    "sentiment",
    "subjectivity",
    "number hashtags",
    # "hashtags",
    "tweet number",
    "retweet number",
    "quote number",
    "status length",
    "number urls",
    "baddies",
    "mentions",
    # "is_50",
    # "is_63",
    # "is_50_2",
    # "is_63_2",
    "time_diff",
    "time_diff_median",
    "created_at",
]

# Append every "*_empath" column, then every "*_glove" column, skipping names
# that start with "c_".
# NOTE(review): the "c_" prefix presumably marks the neighbor-aggregated
# variants of these columns -- confirm against the dataset description.
features += [
    name
    for suffix in ("_empath", "_glove")
    for name in columns
    if name.endswith(suffix) and not name.startswith("c_")
]

In [37]:
# Restrict the table to the columns named in `features` (in that order) and
# rebind `df` to the result; columns not listed are no longer reachable via `df`.
df = df[features]
# Bare last expression so the notebook renders the resulting frame as the cell output.
df

Unnamed: 0,user_id,hate,statuses_count,followers_count,followees_count,favorites_count,listed_count,sentiment,subjectivity,number hashtags,...,290_glove,291_glove,292_glove,293_glove,294_glove,295_glove,296_glove,297_glove,298_glove,299_glove
0,0,normal,101767,3504,3673,81635,53,0.035132,0.431656,16.0,...,-0.162803,0.022277,-0.061740,-0.108736,0.095297,0.014048,-0.079171,-0.064034,0.039573,0.121562
1,1,other,2352,19609,309,61,197,0.088142,0.418649,40.0,...,-0.220041,0.061212,0.045925,-0.090857,0.033579,-0.038353,-0.068932,-0.076778,0.016700,0.081580
2,2,other,1044,2371,2246,561,16,0.117861,0.455828,328.0,...,-0.071102,0.044288,0.044124,-0.058129,-0.026722,-0.034385,-0.015848,-0.033579,-0.015725,0.110611
3,3,other,167172,3004,298,3242,53,0.261688,0.558544,127.0,...,-0.085447,0.039014,0.014094,-0.090334,0.155788,-0.006964,-0.071013,-0.043521,0.023655,0.126403
4,4,other,1998,17643,19355,485,239,0.121533,0.435334,1710.0,...,-0.162940,0.112401,-0.109144,-0.143494,-0.015770,0.184437,0.015669,0.074143,-0.107558,-0.102976
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100381,100381,other,6425,70081,551,1770,223,0.174331,0.511019,241.0,...,-0.101585,0.056177,-0.011879,-0.079700,0.057477,-0.037915,-0.044164,-0.029799,-0.056733,0.108609
100382,100382,other,16928,62727,463,52132,100,0.227946,0.571372,69.0,...,-0.048893,-0.034731,-0.102936,-0.107279,0.135280,-0.022714,-0.062393,-0.037941,-0.016274,0.070525
100383,100383,other,1118,12885,29,0,598,0.182165,0.446904,143.0,...,-0.139124,0.025163,0.048279,-0.062656,0.027198,-0.054500,-0.013992,-0.046172,-0.037651,0.112789
100384,100384,other,3359,517,137,595,17,0.116135,0.541092,97.0,...,-0.140663,0.020672,-0.031872,-0.104729,0.103883,-0.035841,-0.071515,-0.048701,0.055673,0.084698


In [39]:
# Tally how many rows carry each distinct value of the `hate` column.
df["hate"].value_counts()

hate
other      95415
normal      4427
hateful      544
Name: count, dtype: int64