# Natural Language Processing analysis of Mastodon's servers' rules

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np

from utils.SocialMediaDataset import MastodonDataset, RedditDataset
from utils.rules_extractor import RulesProcessor
import utils.utils as utils

## Data cleaning and preprocessing

In [3]:
# Instantiate the two project dataset wrappers (utils.SocialMediaDataset).
# Both constructors are called without arguments here; each exposes its data as `.df`.
mastodon = MastodonDataset()
reddit = RedditDataset()

In [4]:
# Mastodon frame as loaded, displayed before clean() is called further down.
mastodon.df

Unnamed: 0,domain,title,description,active_month,languages,rules,total_users
0,mastodon.social,Mastodon,The original server operated by the Mastodon g...,327503,"[""en""]","[{""id"": ""1"", ""text"": ""Sexually explicit or vio...",2711879.0
1,mstdn.social,Mastodon 🐘,A general-purpose Mastodon server with a 500 c...,14546,"[""en""]","[{""id"": ""2"", ""text"": ""Sexually explicit or vio...",260323.0
2,infosec.exchange,Infosec Exchange,A Mastodon instance for info/cyber security-mi...,13319,"[""en""]","[{""id"": ""1"", ""text"": ""Do unto others as you th...",75725.0
3,mas.to,mas.to,"Hello! mas.to is a fast, up-to-date and fun Ma...",11889,"[""en""]","[{""id"": ""9"", ""text"": ""No discrimination, inclu...",183608.0
4,piaille.fr,Piaille,Piaille.fr est un serveur mastodon public fran...,9221,"[""fr""]","[{""id"": ""1"", ""text"": ""Conformément aux lois fr...",40869.0
...,...,...,...,...,...,...,...
135,ceilidh.online,Ceilidh Online,Ceilidh.Online is meant to be a safe gathering...,11,"[""en""]","[{""id"": ""2"", ""text"": ""Do not engage in hate sp...",42.0
136,mastodon.bahia.no,Bahia.no - A Bahia no Mastodon!,"Bahia.no - aqui, todo mundo é baiano! Se você ...",10,"[""pt-BR""]","[{""id"": ""1"", ""text"": ""Idiomas permitidos: Baia...",15.0
137,dariox.club,DARiOX,A safe space for tech-centered LGBTQI+ folks a...,9,"[""en""]","[{""id"": ""2"", ""text"": ""No illegal content"", ""hi...",150.0
138,mastodon.mg,Mastodon Madagascar,La première instance malgache de Mastodon.,6,"[""fr""]","[{""id"": ""1"", ""text"": ""Cette instance étant héb...",113.0


In [5]:
# Reddit frame as loaded, displayed before clean() is called further down.
reddit.df

Unnamed: 0,name,title,description,language,subscribers,active_user_count,rules
0,Home,Home,,en,307843,51,
1,AskReddit,Ask Reddit...,r/AskReddit is the place to ask and answer tho...,es,54701432,9986,Rule 1 - Questions must be clear and direct an...
2,NoStupidQuestions,No such thing as stupid questions,Ask away!\n\nDisclaimer: This is an anonymous ...,en,6021303,4872,Top level comments must contain a genuine huma...
3,BaldursGate3,Baldur's Gate 3,"A community all about Baldur's Gate III, the r...",en,3118378,1384,Be civil to one another.; Respect the opinions...
4,facepalm,now double verified,/r/facepalm - please sir can I have some more?,en,8145733,2345,"No uncivil, bigoted, misogynist, misandrist, r..."
...,...,...,...,...,...,...,...
95,SteamDeck,Steam Deck,The Unofficial Subreddit for the Valve Steam D...,en,903284,446,Be Kind Or Get Banned; Posts must be about or ...
96,college,College,The subreddit for discussion related to colleg...,en,2911405,76,Do not post spam or surveys.; Do not post anyt...
97,manga,"/r/manga: manga, on reddit.",Everything and anything manga! (manhwa/manhua...,en,4725267,1480,Disrespectful; Follow submission guidelines wh...
98,CrazyFuckingVideos,CrazyFuckingVideos,Crazy fucking videos for your viewing pleasure,en,2240152,799,Follow Reddit's TOS; Be civil; Must be a Crazy...


In [6]:
# Run each wrapper's cleaning step; the log printed below lists the columns processed.
# Judging by the frames displayed afterwards, the Reddit columns end up with the same
# names as the Mastodon ones (domain, languages, total_users, active_month).
mastodon.clean()
reddit.clean()


Starting data cleaning process... 🧹🧼
Cleaning column: description
Cleaning column: active_month
Cleaning column: languages
Cleaning column: rules
Cleaning column: total_users
Data is all clean and shiny! ✨🫧

Starting data cleaning process... 🧹🧼
Cleaning column: description
Cleaning column: languages
Cleaning column: total_users
Cleaning column: active_month
Cleaning column: rules
Data is all clean and shiny! ✨🫧


In [7]:
# Short handle on the cleaned Mastodon frame, used by the cells below.
df_mastodon = mastodon.df
df_mastodon

Unnamed: 0,domain,title,description,active_month,languages,rules,total_users
0,mastodon.social,Mastodon,The original server operated by the Mastodon g...,327503,[en],"[{'id': '1', 'text': 'Sexually explicit or vio...",2711879
1,mstdn.social,Mastodon 🐘,A general-purpose Mastodon server with a 500 c...,14546,[en],"[{'id': '2', 'text': 'Sexually explicit or vio...",260323
2,infosec.exchange,Infosec Exchange,A Mastodon instance for info/cyber security-mi...,13319,[en],"[{'id': '1', 'text': 'Do unto others as you th...",75725
3,mas.to,mas.to,"Hello! mas.to is a fast, up-to-date and fun Ma...",11889,[en],"[{'id': '9', 'text': 'No discrimination, inclu...",183608
4,piaille.fr,Piaille,Piaille.fr est un serveur mastodon public fran...,9221,[fr],"[{'id': '1', 'text': 'Conformément aux lois fr...",40869
...,...,...,...,...,...,...,...
135,ceilidh.online,Ceilidh Online,Ceilidh.Online is meant to be a safe gathering...,11,[en],"[{'id': '2', 'text': 'Do not engage in hate sp...",42
136,mastodon.bahia.no,Bahia.no - A Bahia no Mastodon!,"Bahia.no - aqui, todo mundo é baiano! Se você ...",10,[pt-BR],"[{'id': '1', 'text': 'Idiomas permitidos: Baia...",15
137,dariox.club,DARiOX,A safe space for tech-centered LGBTQI+ folks a...,9,[en],"[{'id': '2', 'text': 'No illegal content', 'hi...",150
138,mastodon.mg,Mastodon Madagascar,La première instance malgache de Mastodon.,6,[fr],"[{'id': '1', 'text': 'Cette instance étant héb...",113


In [8]:
# Short handle on the cleaned Reddit frame, used by the cells below.
df_reddit = reddit.df
df_reddit

Unnamed: 0,domain,title,description,languages,total_users,active_month,rules
0,Home,Home,,en,307843,51,[]
1,AskReddit,Ask Reddit...,r/AskReddit is the place to ask and answer tho...,es,54701432,9986,[Rule 1 - Questions must be clear and direct a...
2,NoStupidQuestions,No such thing as stupid questions,Ask away!\n\nDisclaimer: This is an anonymous ...,en,6021303,4872,[Top level comments must contain a genuine hum...
3,BaldursGate3,Baldur's Gate 3,"A community all about Baldur's Gate III, the r...",en,3118378,1384,"[Be civil to one another., Respect the opinio..."
4,facepalm,now double verified,/r/facepalm - please sir can I have some more?,en,8145733,2345,"[No uncivil, bigoted, misogynist, misandrist, ..."
...,...,...,...,...,...,...,...
95,SteamDeck,Steam Deck,The Unofficial Subreddit for the Valve Steam D...,en,903284,446,"[Be Kind Or Get Banned, Posts must be about o..."
96,college,College,The subreddit for discussion related to colleg...,en,2911405,76,"[Do not post spam or surveys., Do not post an..."
97,manga,"/r/manga: manga, on reddit.",Everything and anything manga! (manhwa/manhua...,en,4725267,1480,"[Disrespectful, Follow submission guidelines ..."
98,CrazyFuckingVideos,CrazyFuckingVideos,Crazy fucking videos for your viewing pleasure,en,2240152,799,"[Follow Reddit's TOS, Be civil, Must be a Cr..."


In [9]:
# Side-by-side language counts for the two platforms (project helper utils.compare_languages).
display(utils.compare_languages(df_mastodon, df_reddit))

Unnamed: 0,mastodon,reddit
0,'en' : 94,'en' : 99
1,'de' : 16,'es' : 1
2,'fr' : 7,
3,'ko' : 3,
4,'es' : 3,
5,'nl' : 2,
6,'it' : 2,
7,'he' : 1,
8,'gd' : 1,
9,'pl' : 1,


In [10]:
# Mastodon stores a list of language codes per server: keep only servers whose
# list is exactly ['en'] (servers declaring several languages are dropped).
mastodon_only_en = df_mastodon['languages'].map(
    lambda langs: isinstance(langs, list) and langs == ['en']
)
df_mastodon_en = df_mastodon[mastodon_only_en]

# Reddit stores a single language code per subreddit: keep the ones equal to 'en'.
reddit_is_en = df_reddit['languages'].map(
    lambda lang: isinstance(lang, str) and lang == 'en'
)
df_reddit_en = df_reddit[reddit_is_en]

In [11]:
# English-only Mastodon servers; row labels are kept from df_mastodon (note the gaps).
df_mastodon_en

Unnamed: 0,domain,title,description,active_month,languages,rules,total_users
0,mastodon.social,Mastodon,The original server operated by the Mastodon g...,327503,[en],"[{'id': '1', 'text': 'Sexually explicit or vio...",2711879
1,mstdn.social,Mastodon 🐘,A general-purpose Mastodon server with a 500 c...,14546,[en],"[{'id': '2', 'text': 'Sexually explicit or vio...",260323
2,infosec.exchange,Infosec Exchange,A Mastodon instance for info/cyber security-mi...,13319,[en],"[{'id': '1', 'text': 'Do unto others as you th...",75725
3,mas.to,mas.to,"Hello! mas.to is a fast, up-to-date and fun Ma...",11889,[en],"[{'id': '9', 'text': 'No discrimination, inclu...",183608
5,hachyderm.io,Hachyderm.io,"Hachyderm is a safe space, LGBTQIA+ and BLM, p...",8937,[en],"[{'id': '1', 'text': 'Don't be a dick.', 'hint...",56318
...,...,...,...,...,...,...,...
133,mcr.wtf,mcr.wtf 🐝 Mastodon for Manchester,A community open to all whilst intended for pe...,14,[en],"[{'id': '1', 'text': 'No discrimination, inclu...",157
134,nfld.me,Mastodon Newfoundland and Labrador,Newfoundland and Labrador's Mastodon Server,12,[en],"[{'id': '1', 'text': 'No hate speech. We have ...",73
135,ceilidh.online,Ceilidh Online,Ceilidh.Online is meant to be a safe gathering...,11,[en],"[{'id': '2', 'text': 'Do not engage in hate sp...",42
137,dariox.club,DARiOX,A safe space for tech-centered LGBTQI+ folks a...,9,[en],"[{'id': '2', 'text': 'No illegal content', 'hi...",150


## Mastodon dataset

In [12]:
# One row per (server, rule): explode each server's rule list, keep the row label
# of df_mastodon_en as `server_id`, and drop rows that carry no rule.
rules = (
    df_mastodon_en[['rules']]
    .explode('rules')
    .reset_index()
    .rename(columns={"index": "server_id"})
    .dropna()
)
rules

Unnamed: 0,server_id,rules
0,0,"{'id': '1', 'text': 'Sexually explicit or viol..."
1,0,"{'id': '2', 'text': 'No racism, sexism, homoph..."
2,0,"{'id': '3', 'text': 'No incitement of violence..."
3,0,"{'id': '4', 'text': 'No harassment, block evas..."
4,0,"{'id': '7', 'text': 'Do not share information ..."
...,...,...
803,139,"{'id': '77', 'text': '③ 同意呜呜站的社区规则？/ Do you ag..."
804,139,"{'id': '78', 'text': '🌸 缺项或错误将被拒绝 ... / Incomp..."
805,139,"{'id': '79', 'text': '详细社区规则见： / For detailed ..."
806,139,"{'id': '80', 'text': 'https://wxw.moe/about', ..."


In [233]:
# Each entry of the `rules` column is a dict such as
#   {"id": "rule_id", "text": "rule text", "hint": "hint text"}
# Expand those dicts into one column per key and put them next to `server_id`.
rule_fields = rules['rules'].apply(pd.Series)
server_ids = rules.drop(columns=['rules'])

rules = pd.concat([server_ids, rule_fields], axis=1)
# `id` is the rule identifier; name it explicitly so it cannot be confused with server_id.
rules = rules.rename(columns={'id': "rule_id"})
rules

Unnamed: 0,server_id,rule_id,text,hint
0,0,1,Sexually explicit or violent media must be mar...,This includes content that is particularly pro...
1,0,2,"No racism, sexism, homophobia, transphobia, ab...",Transphobic behavior such as intentional misge...
2,0,3,No incitement of violence or promotion of viol...,Calling for people or groups to be assassinate...
3,0,4,"No harassment, block evasion, dogpiling, or do...",Repeat attempts to communicate with users who ...
4,0,7,Do not share information widely-known to be fa...,False and misleading information and links fro...
...,...,...,...,...
803,139,77,③ 同意呜呜站的社区规则？/ Do you agree with the community...,
804,139,78,🌸 缺项或错误将被拒绝 ... / Incomplete or nonsensical re...,
805,139,79,"详细社区规则见： / For detailed community rules, pleas...",
806,139,80,https://wxw.moe/about,


In [234]:
# Keep only the rules whose text is detected as English by utils.is_english.
is_english_rule = rules['text'].apply(utils.is_english)
df_english = rules[is_english_rule].reset_index(drop=True)

# Share of rules discarded by the language filter, in percent.
non_english_rules_pourcentage = 100 * (len(rules) - len(df_english)) / len(rules)

print(f" We removed {rules.shape[0] - df_english.shape[0]} of the {rules.shape[0]} rules ({non_english_rules_pourcentage:.0f}%) that were not detected to be in english.")

 We removed 219 of the 807 rules (27%) that were not detected to be in english.


In [235]:
# Detect the language of the description among the supposedly English servers.
# The detector is run once and the mask reused, so the two index sets below are
# guaranteed to be exact complements of each other (and the work is not done twice).
descr_is_english = df_mastodon_en['description'].apply(utils.is_english)
server_descr_english = df_mastodon_en[descr_is_english].index
server_descr_not_english = df_mastodon_en[~descr_is_english].index

# Count them
print(f"{len(server_descr_english)} servers have their description in english;")
print(f"{len(server_descr_not_english)} servers have their description in another language.")

# Display the rules that passed the English filter but belong to servers whose
# description is in another language: these are the candidates for manual review.
# server_id values are row labels of df_mastodon_en, hence the membership test on its index.
suspicious_server_ids = [serv_id for serv_id in df_english.server_id.unique() if serv_id in server_descr_not_english]
df_english[df_english['server_id'].isin(suspicious_server_ids)]

78 servers have their description in english;
15 servers have their description in another language.


Unnamed: 0,server_id,rule_id,text,hint
143,31,12,Treat others with respect,"Avoid any form of insult, bullying, or discrim..."
144,31,15,Comply with applicable law,Do not publish any illegal content.
145,31,16,Avoid misinformation,Do not share false or misleading information.
146,31,19,Mark sensitive content (CW),Please mark any content that may contain commo...
147,31,20,You must be older than 16 years,"According to European law, you must be at leas..."
152,33,3,No al machismo,
153,33,9,No ai bot*,"* in generale è una regola da tenere presente,..."
363,93,21,Don't be an asshole. You have an opinion? That...,
364,93,23,"Harassment, stalking, doxxing, transmisogyny, ...",
365,93,25,"Violent nationalist propaganda, Nazi symbolism...",


In [236]:
# We decide to remove 4 servers based on their rules language: their rules were
# inspected manually and most of them contained words that are not English.
manually_excluded = [52, 180, 230, 329]

# server_id values are row labels of df_mastodon_en, so every excluded id should be
# one of them. Ids that are not would make the filter below silently do nothing.
# NOTE(review): the displayed frames only go up to server_id 139, so 180, 230 and 329
# look like row positions rather than server ids - confirm the intended servers.
unknown_ids = [serv_id for serv_id in manually_excluded if serv_id not in df_mastodon_en.index]
if unknown_ids:
    print(f"Warning: excluded ids {unknown_ids} do not match any server_id; nothing is removed for them.")

df_english = df_english[~df_english['server_id'].isin(manually_excluded)]
df_english

Unnamed: 0,server_id,rule_id,text,hint
0,0,1,Sexually explicit or violent media must be mar...,This includes content that is particularly pro...
1,0,3,No incitement of violence or promotion of viol...,Calling for people or groups to be assassinate...
2,0,4,"No harassment, block evasion, dogpiling, or do...",Repeat attempts to communicate with users who ...
3,0,7,Do not share information widely-known to be fa...,False and misleading information and links fro...
4,0,1008,"Content created by others must be attributed, ...",Content created by others must clearly provide...
...,...,...,...,...
583,139,76,② 简答 ACGN 的含义？/ What does ACGN stand for?,
584,139,77,③ 同意呜呜站的社区规则？/ Do you agree with the community...,
585,139,78,🌸 缺项或错误将被拒绝 ... / Incomplete or nonsensical re...,
586,139,79,"详细社区规则见： / For detailed community rules, pleas...",


In [237]:
# Process the rules for NLP
# RulesProcessor is a project helper (utils.rules_extractor). Judging by the output
# below, standardize_rules() returns one row per rule with `text` and `hint` as
# lower-cased token lists.
processor = RulesProcessor(df_mastodon_en)
# NOTE(review): rules_df is not used again in the visible cells; extract_rules() is
# presumably still needed to populate the processor before standardize_rules() - confirm.
rules_df = processor.extract_rules()
standardized_df = processor.standardize_rules()

# Compute strictness metrics TODO: define metrics and compute them
# Flag each rule using the project helper utils.contains_strict_words on its tokenised text.
standardized_df["strict_rule"] = standardized_df["text"].apply(utils.contains_strict_words)
standardized_df

Unnamed: 0,server_id,rule_id,text,hint,strict_rule
0,0,1,"[sexually, explicit, or, violent, media, must,...","[this, includes, content, that, is, particular...",False
1,0,2,"[no, racism, sexism, homophobia, transphobia, ...","[transphobic, behavior, such, as, intentional,...",True
2,0,3,"[no, incitement, of, violence, or, promotion, ...","[calling, for, people, or, groups, to, be, ass...",True
3,0,4,"[no, harassment, block, evasion, dogpiling, or...","[repeat, attempts, to, communicate, with, user...",True
4,0,7,"[do, not, share, information, widely, known, t...","[false, and, misleading, information, and, lin...",False
...,...,...,...,...,...
803,139,77,"[do, you, agree, with, the, community, rules, ...",[],False
804,139,78,"[incomplete, or, nonsensical, responses, will,...",[],False
805,139,79,"[for, detailed, community, rules, please, see]",[],False
806,139,80,"[https, wxw, moe, about]",[],False


In [238]:
# Restrict the standardized rules to those that survived the English filter:
# an inner join on the (server_id, rule_id) key pairs present in df_english.
english_rule_keys = df_english[["server_id", "rule_id"]]
standardized_df = pd.merge(
    standardized_df,
    english_rule_keys,
    how='inner',
    on=["server_id", "rule_id"],
)
standardized_df

Unnamed: 0,server_id,rule_id,text,hint,strict_rule
0,0,1,"[sexually, explicit, or, violent, media, must,...","[this, includes, content, that, is, particular...",False
1,0,3,"[no, incitement, of, violence, or, promotion, ...","[calling, for, people, or, groups, to, be, ass...",True
2,0,4,"[no, harassment, block, evasion, dogpiling, or...","[repeat, attempts, to, communicate, with, user...",True
3,0,7,"[do, not, share, information, widely, known, t...","[false, and, misleading, information, and, lin...",False
4,0,1008,"[content, created, by, others, must, be, attri...","[content, created, by, others, must, clearly, ...",False
...,...,...,...,...,...
580,139,76,"[acgn, what, does, acgn, stand, for]",[],False
581,139,77,"[do, you, agree, with, the, community, rules, ...",[],False
582,139,78,"[incomplete, or, nonsensical, responses, will,...",[],False
583,139,79,"[for, detailed, community, rules, please, see]",[],False


In [239]:
# Natural language processing
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# Make sure the NLTK stopword corpus is available locally.
nltk.download('stopwords')

# Stopwords pooled over four languages (not only English).
# NOTE(review): the French/German/Spanish lists may contain tokens that are
# meaningful English words - confirm this pooling is intended.
stop_words = set()
for language in ('english', 'french', 'german', 'spanish'):
    stop_words.update(stopwords.words(language))


def _without_stop_words(words):
    """Return `words` minus stopwords; values that are not lists pass through unchanged."""
    if not isinstance(words, list):
        return words
    return [word for word in words if word not in stop_words]


# Strip stopwords from both tokenised columns.
for column in ("text", "hint"):
    standardized_df[column] = standardized_df[column].apply(_without_stop_words)

# Lemmatization is deliberately skipped: it is not needed for BERTopic.

standardized_df

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/eglantinevialaneix/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,server_id,rule_id,text,hint,strict_rule
0,0,1,"[sexually, explicit, violent, media, must, mar...","[includes, content, particularly, provocative,...",False
1,0,3,"[incitement, violence, promotion, violent, ide...","[calling, people, groups, assassinated, murder...",True
2,0,4,"[harassment, block, evasion, dogpiling, doxxin...","[repeat, attempts, communicate, users, blocked...",True
3,0,7,"[share, information, widely, known, false, mis...","[false, misleading, information, links, low, q...",False
4,0,1008,"[content, created, others, must, attributed, u...","[content, created, others, must, clearly, prov...",False
...,...,...,...,...,...
580,139,76,"[acgn, acgn, stand]",[],False
581,139,77,"[agree, community, rules, instance]",[],False
582,139,78,"[incomplete, nonsensical, responses, result, r...",[],False
583,139,79,"[detailed, community, rules, please, see]",[],False


In [240]:
# Build one token list ("document") per rule with the project helper
# utils.create_document (presumably combining `text` and `hint` - confirm).
nlp_df = standardized_df.copy()
nlp_df["document"] = nlp_df.apply(utils.create_document, axis=1)

# Flatten all documents into a single token stream, then derive the vocabulary
# and the corpus-wide token frequencies from it.
all_tokens = nlp_df["document"].explode()
vocab = all_tokens.unique()
all_tokens.value_counts()

document
content        178
must            77
promotion       56
violent         52
accounts        48
              ... 
borderline       1
construed        1
minority         1
size             1
registering      1
Name: count, Length: 1631, dtype: int64

In [241]:
# TF-IDF scores
# TfidfVectorizer expects one string per document, so glue each token list back together.
nlp_df['document_str'] = nlp_df['document'].apply(' '.join)

# Learn the vocabulary and weight every document in one pass (default settings).
# With the default token pattern, tokens shorter than two characters are ignored,
# so the matrix can have slightly fewer columns than the vocabulary counted above.
tfidf_vect = TfidfVectorizer()
tfidf_matrix = tfidf_vect.fit_transform(nlp_df['document_str'])

# Vocabulary terms, one per matrix column.
feature_names = tfidf_vect.get_feature_names_out()

# Dense copy (documents x terms) for convenient per-document inspection.
tfidf = pd.DataFrame(data=tfidf_matrix.toarray(), columns=feature_names)
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
    

TF-IDF matrix shape: (585, 1624)


In [242]:
# Rank vocabulary terms by their mean TF-IDF weight over all rule documents.
column_means = tfidf_matrix.mean(axis=0)
mean_tfidf = np.asarray(column_means).ravel()  # flatten the 1 x n_terms result to 1-D

term_scores = pd.DataFrame({'term': feature_names, 'score': mean_tfidf})
ranked_terms = term_scores.sort_values('score', ascending=False)

print("Top 10 terms by average TF-IDF score:")
print(ranked_terms.head(10))

Top 10 terms by average TF-IDF score:
            term     score
325      content  0.049174
1570     violent  0.028964
1140   promotion  0.028426
1569    violence  0.027094
949         must  0.025458
665   harassment  0.024997
1545       users  0.023781
739   incitement  0.022182
714   ideologies  0.020737
1354        spam  0.019079


In [243]:
# Display the top TF-IDF words for a few sample documents.
# A seeded generator keeps the sampled documents stable across "Restart & Run All".
sample_rng = np.random.default_rng(42)
# Draw row *positions* (rows are read with .iloc below, not by label) and never
# request more documents than the matrix holds.
sample_indices = sample_rng.choice(len(tfidf), size=min(5, len(tfidf)), replace=False)
for idx in sample_indices:
    print(f"Top TF-IDF words for document {idx}:")
    doc_tfidf = tfidf.iloc[idx]
    # The five highest-weighted terms of this document (zeros appear for very short rules).
    top_words = doc_tfidf.nlargest(5)
    print(top_words, '\n')

Top TF-IDF words for document 165:
casteism      0.311001
semitism      0.311001
called        0.263404
conversion    0.263404
deadnaming    0.263404
Name: 165, dtype: float64 

Top TF-IDF words for document 579:
register    0.680776
want        0.559167
instance    0.473155
1044123     0.000000
13          0.000000
Name: 579, dtype: float64 

Top TF-IDF words for document 493:
annotated     0.340068
provided      0.340068
must          0.333846
original      0.319427
preferably    0.319427
Name: 493, dtype: float64 

Top TF-IDF words for document 149:
violating     0.466664
fun           0.445268
keep          0.363562
safe          0.357609
respectful    0.352098
Name: 149, dtype: float64 

Top TF-IDF words for document 54:
incite      0.372239
cause       0.349646
intended    0.321181
towards     0.311021
groups      0.302432
Name: 54, dtype: float64 



In [244]:
# Topic modeling - runs for 20 sec
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so the topic
# table shown in the next cell presumably comes from a different run - re-run to confirm.
from bertopic import BERTopic

# BERTopic is built with its default settings; no seed / random_state is passed, so
# topic ids and counts presumably change from run to run - TODO confirm and pin if needed.
topic_model = BERTopic()
# One topic id per rule document (the space-joined token strings) plus the model's probabilities.
topics, probs = topic_model.fit_transform(nlp_df['document_str'])

KeyboardInterrupt: 

In [None]:
# Per-topic summary for the Mastodon rules; in the table below, Topic -1 is BERTopic's outlier bucket.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,65,-1_languages_english_servers_language,"[languages, english, servers, language, commun...",[allowed languages german english main languag...
1,0,100,0_content_sensitive_nsfw_explicit,"[content, sensitive, nsfw, explicit, media, wa...",[explicit nsfw content without content warning...
2,1,67,1_illegal_content_united_post,"[illegal, content, united, post, laws, kingdom...","[content illegal united kingdom, content illeg..."
3,2,65,2_spam_advertising_accounts_commercial,"[spam, advertising, accounts, commercial, emai...","[advertising spam excessive promotion, spam in..."
4,3,41,3_false_misleading_information_intentionally,"[false, misleading, information, intentionally...",[share intentionally false misleading informat...
5,4,37,4_violence_incitement_ideologies_promotion,"[violence, incitement, ideologies, promotion, ...",[incitement violence promotion violent ideolog...
6,5,35,5_speech_hate_hateful_sexist,"[speech, hate, hateful, sexist, slurs, homopho...",[hate speech zero tolerance policy hate speech...
7,6,26,6_respectful_respect_keep_everyone,"[respectful, respect, keep, everyone, civil, o...",[order make libretooth welcoming safe place ev...
8,7,25,7_transphobia_homophobia_racism_sexism,"[transphobia, homophobia, racism, sexism, disc...",[racism sexism homophobia transphobia xenophob...
9,8,24,8_doxxing_dogpiling_users_harassment,"[doxxing, dogpiling, users, harassment, identi...","[harassment dogpiling doxxing users, harassmen..."


## Reddit dataset

In [None]:
# English-only subreddits selected earlier; row labels are kept from df_reddit.
df_reddit_en

Unnamed: 0,domain,title,description,languages,total_users,active_month,rules
0,Home,Home,,en,307843,51,[]
2,NoStupidQuestions,No such thing as stupid questions,Ask away!\n\nDisclaimer: This is an anonymous ...,en,6021303,4872,[Top level comments must contain a genuine hum...
3,BaldursGate3,Baldur's Gate 3,"A community all about Baldur's Gate III, the r...",en,3118378,1384,"[Be civil to one another., Respect the opinio..."
4,facepalm,now double verified,/r/facepalm - please sir can I have some more?,en,8145733,2345,"[No uncivil, bigoted, misogynist, misandrist, ..."
5,interestingasfuck,Interesting As Fuck,For anything truly interesting as fuck,en,13961838,4038,"[Posts MUST be INTERESTING AS FUCK!, No Polit..."
...,...,...,...,...,...,...,...
95,SteamDeck,Steam Deck,The Unofficial Subreddit for the Valve Steam D...,en,903284,446,"[Be Kind Or Get Banned, Posts must be about o..."
96,college,College,The subreddit for discussion related to colleg...,en,2911405,76,"[Do not post spam or surveys., Do not post an..."
97,manga,"/r/manga: manga, on reddit.",Everything and anything manga! (manhwa/manhua...,en,4725267,1480,"[Disrespectful, Follow submission guidelines ..."
98,CrazyFuckingVideos,CrazyFuckingVideos,Crazy fucking videos for your viewing pleasure,en,2240152,799,"[Follow Reddit's TOS, Be civil, Must be a Cr..."


In [None]:
# One row per (subreddit, rule). `rules` is rebound here and from now on refers to
# the Reddit rules, no longer the Mastodon ones.
rules = (
    df_reddit_en['rules']
    .explode()
    .reset_index()
    .rename(columns={"index": "server_id"})
    # Reddit rules carry no identifier: number them within each subreddit.
    .assign(rule_id=lambda frame: frame.groupby("server_id").cumcount())
    # Subreddits with an empty rule list explode to a single missing row; drop it.
    .dropna()
)
rules

Unnamed: 0,server_id,rules,rule_id
1,2,Top level comments must contain a genuine huma...,0
2,2,Please try searching the subreddit for your q...,1
3,2,Be Nice,2
4,2,No medical advice questions,3
5,2,No trolling or joke questions,4
...,...,...,...
1040,99,No image macros/memes,8
1041,99,No crossposts,9
1042,99,No r/RoastMe,10
1043,99,Moderators Discretion,11


In [None]:
# Keep only the Reddit rules whose text is detected as English by utils.is_english.
is_english_rule = rules['rules'].apply(utils.is_english)
df_english = rules[is_english_rule].reset_index(drop=True)

# Share of rules discarded by the language filter, in percent.
non_english_rules_pourcentage = 100 * (len(rules) - len(df_english)) / len(rules)

print(f" We removed {rules.shape[0] - df_english.shape[0]} of the {rules.shape[0]} rules ({non_english_rules_pourcentage:.0f}%) that were not detected to be in english.")

 We removed 277 of the 1044 rules (27%) that were not detected to be in english.


In [None]:
def r_remove_empty(x):
    """Drop empty-string items from a list.

    Anything that is not a list, and the empty list itself, is returned unchanged.
    """
    if not isinstance(x, list) or not x:
        return x
    return list(filter(lambda item: item != '', x))


def r_standardize_text(df_column):
    """Turn a Series of raw rule strings into a Series of lower-case token lists.

    Steps: trim, lower-case, replace every character that is not a letter, digit
    or whitespace with a space, collapse whitespace runs, split on single spaces,
    then remove empty tokens (a rule made only of punctuation becomes []).
    Non-string entries skip the initial trim and are then handled by the pandas
    `.str` accessor.
    """
    trimmed = df_column.apply(lambda value: value.strip() if isinstance(value, str) else value)
    tokens = (
        trimmed.str.lower()
        .str.replace(r"[^a-zA-Z0-9\s]", " ", regex=True)
        .str.replace(r"\s+", " ", regex=True)
        .str.strip()
        .str.split(" ")
    )
    return tokens.apply(r_remove_empty)

In [None]:
# Quick check that df_english is a DataFrame (debugging aid; can be removed once stable).
type(df_english)

pandas.core.frame.DataFrame

In [None]:
# Reddit rules detected as English; rule_id keeps its original numbering, hence the gaps.
df_english

Unnamed: 0,server_id,rules,rule_id
0,2,Top level comments must contain a genuine huma...,0
1,2,Please try searching the subreddit for your q...,1
2,2,No suicide or 'was I raped/sexual assaulted/s...,5
3,2,"No self-promotion, shilling or begging",6
4,2,No illegal/unethical or disturbing subject ma...,7
...,...,...,...
762,99,No personal information,4
763,99,No long videos (>10s) or long texts,7
764,99,No r/RoastMe,10
765,99,Moderators Discretion,11


In [None]:
# Process the rules for NLP: replace each raw rule string by its list of
# lower-case tokens, leaving df_english itself untouched.
standardized_df = df_english.assign(rules=r_standardize_text(df_english['rules']))

# Compute strictness metrics TODO: define metrics and compute them
# (the strict_rule flag is not computed for Reddit yet)
standardized_df

Unnamed: 0,server_id,rules,rule_id
0,2,"[top, level, comments, must, contain, a, genui...",0
1,2,"[please, try, searching, the, subreddit, for, ...",1
2,2,"[no, suicide, or, was, i, raped, sexual, assau...",5
3,2,"[no, self, promotion, shilling, or, begging]",6
4,2,"[no, illegal, unethical, or, disturbing, subje...",7
...,...,...,...
762,99,"[no, personal, information]",4
763,99,"[no, long, videos, 10s, or, long, texts]",7
764,99,"[no, r, roastme]",10
765,99,"[moderators, discretion]",11


In [None]:
# Keep only the rows from standardized_df that have a matching server_id and rule_id in df_english
# NOTE(review): in this Reddit section standardized_df was built from df_english.copy()
# in the previous cell, so every (server_id, rule_id) pair should already match and this
# inner merge is expected to leave the rows unchanged - confirm and consider removing it.
standardized_df = standardized_df.merge(df_english[["server_id", "rule_id"]], on=["server_id", "rule_id"], how='inner')
standardized_df

Unnamed: 0,server_id,rules,rule_id
0,2,"[top, level, comments, must, contain, a, genui...",0
1,2,"[please, try, searching, the, subreddit, for, ...",1
2,2,"[no, suicide, or, was, i, raped, sexual, assau...",5
3,2,"[no, self, promotion, shilling, or, begging]",6
4,2,"[no, illegal, unethical, or, disturbing, subje...",7
...,...,...,...
762,99,"[no, personal, information]",4
763,99,"[no, long, videos, 10s, or, long, texts]",7
764,99,"[no, r, roastme]",10
765,99,"[moderators, discretion]",11


In [None]:
# Natural language processing
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# Make sure the NLTK stopword corpus is available locally.
nltk.download('stopwords')

# Stopwords pooled over four languages (not only English).
stop_words = set().union(
    *(stopwords.words(language) for language in ('english', 'french', 'german', 'spanish'))
)

# Strip stopwords from the tokenised rules; values that are not lists are left as they are.
standardized_df["rules"] = standardized_df["rules"].map(
    lambda tokens: [token for token in tokens if token not in stop_words]
    if isinstance(tokens, list)
    else tokens
)

# Lemmatization is deliberately skipped: it is not needed for BERTopic.

standardized_df

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/eglantinevialaneix/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,server_id,rules,rule_id
0,2,"[top, level, comments, must, contain, genuine,...",0
1,2,"[please, try, searching, subreddit, question, ...",1
2,2,"[suicide, raped, sexual, assaulted, sexually, ...",5
3,2,"[self, promotion, shilling, begging]",6
4,2,"[illegal, unethical, disturbing, subject, matter]",7
...,...,...,...
762,99,"[personal, information]",4
763,99,"[long, videos, 10s, long, texts]",7
764,99,"[r, roastme]",10
765,99,"[moderators, discretion]",11


In [None]:
# Reddit rules have no separate hint, so the tokenised `rules` column is used
# directly as the document (no create_document step here).
nlp_df = standardized_df.copy()

# Flatten all rules into a single token stream, then derive the vocabulary and
# the corpus-wide token frequencies from it.
all_tokens = nlp_df["rules"].explode()
vocab = all_tokens.unique()
all_tokens.value_counts()

rules
posts         126
must           73
content        64
post           39
self           35
             ... 
grey            1
keys            1
reviews         1
deals           1
moderators      1
Name: count, Length: 1171, dtype: int64

In [None]:
# TF-IDF scores
# TfidfVectorizer expects one string per document, so glue each token list back together.
nlp_df['document_str'] = nlp_df['rules'].apply(' '.join)

# Learn the vocabulary and weight every document in one pass (default settings).
# With the default token pattern, tokens shorter than two characters are ignored,
# so the matrix can have slightly fewer columns than the vocabulary counted above.
tfidf_vect = TfidfVectorizer()
tfidf_matrix = tfidf_vect.fit_transform(nlp_df['document_str'])

# Vocabulary terms, one per matrix column.
feature_names = tfidf_vect.get_feature_names_out()

# Dense copy (documents x terms) for convenient per-document inspection.
tfidf = pd.DataFrame(data=tfidf_matrix.toarray(), columns=feature_names)
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
    

TF-IDF matrix shape: (767, 1157)


In [None]:
# Rank vocabulary terms by their mean TF-IDF weight over all Reddit rules.
column_means = tfidf_matrix.mean(axis=0)
mean_tfidf = np.asarray(column_means).ravel()  # flatten the 1 x n_terms result to 1-D

term_scores = pd.DataFrame({'term': feature_names, 'score': mean_tfidf})
ranked_terms = term_scores.sort_values('score', ascending=False)

print("Top 10 terms by average TF-IDF score:")
print(ranked_terms.head(10))

Top 10 terms by average TF-IDF score:
          term     score
778      posts  0.041990
230    content  0.027015
677       must  0.025548
912       self  0.020700
798  promotion  0.017493
607        low  0.017454
189      civil  0.016888
775       post  0.015915
341     effort  0.014231
768   politics  0.013817


In [None]:
# Display the top TF-IDF words for a few sample documents.
# A seeded generator keeps the sampled documents stable across "Restart & Run All".
sample_rng = np.random.default_rng(42)
# Draw row *positions* (rows are read with .iloc below, not by label) and never
# request more documents than the matrix holds.
sample_indices = sample_rng.choice(len(tfidf), size=min(5, len(tfidf)), replace=False)
for idx in sample_indices:
    print(f"Top TF-IDF words for document {idx}:")
    doc_tfidf = tfidf.iloc[idx]
    # The five highest-weighted terms of this document (zeros appear for very short rules).
    top_words = doc_tfidf.nlargest(5)
    print(top_words, '\n')

Top TF-IDF words for document 722:
accessory    0.400045
modding      0.400045
setup        0.400045
hardware     0.360151
review       0.360151
Name: 722, dtype: float64 

Top TF-IDF words for document 583:
tumblr         0.637504
posts          0.517937
screenshots    0.481146
must           0.306315
10k            0.000000
Name: 583, dtype: float64 

Top TF-IDF words for document 361:
generated      0.509268
user           0.509268
advertising    0.452856
surveys        0.443270
content        0.282358
Name: 361, dtype: float64 

Top TF-IDF words for document 659:
high        0.540310
topical     0.540310
relevant    0.442926
quality     0.383681
content     0.269695
Name: 659, dtype: float64 

Top TF-IDF words for document 104:
third    0.693675
party    0.648727
posts    0.313000
10k      0.000000
10s      0.000000
Name: 104, dtype: float64 



In [None]:
# Topic modeling
# Same procedure as for Mastodon, now on the Reddit rule documents.
from bertopic import BERTopic

# This rebinds topic_model, replacing the Mastodon model fitted earlier.
# BERTopic is built with its default settings; no seed / random_state is passed, so
# topic ids and counts presumably change from run to run - TODO confirm and pin if needed.
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(nlp_df['document_str'])

In [None]:
# Per-topic summary for the Reddit rules; in the table below, Topic -1 is BERTopic's outlier bucket.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,164,-1_must_english_questions_posts,"[must, english, questions, posts, submissions,...","[top level comment must answer ask follow, pos..."
1,0,112,0_personal_witch_bigotry_harassment,"[personal, witch, bigotry, harassment, hunts, ...","[personal information witch hunting, comments ..."
2,1,81,1_promotion_self_giveaways_spam,"[promotion, self, giveaways, spam, surveys, ad...","[self promotion, self promotion, self promotion]"
3,2,51,2_rules_content_rule_follow,"[rules, content, rule, follow, restricted, red...","[specific restricted content see rules list, r..."
4,3,46,3_links_piracy_exploits_cheats,"[links, piracy, exploits, cheats, pirated, lin...","[direct links copyrighted material, links disc..."
5,4,37,4_posts_must_relevant_deck,"[posts, must, relevant, deck, related, steam, ...",[game review deck posts must include game titl...
6,5,33,5_screenshots_memes_meme_image,"[screenshots, memes, meme, image, pictures, te...","[screenshots, screenshots memes infographics, ..."
7,6,31,6_discussion_thread_threads_megathread,"[discussion, thread, threads, megathread, use,...","[discussion moderation meta sub topics, use da..."
8,7,30,7_titles_title_clickbait_editorialized,"[titles, title, clickbait, editorialized, arti...","[editorialized misleading titles, title must m..."
9,8,28,8_nsfw_spoilers_gore_porn,"[nsfw, spoilers, gore, porn, mark, dick, death...","[mark nsfw content, mark nsfw spoilers, proper..."


In [None]:
# TODO: try consensus clustering on the topics shared between servers