# **Get Unique Users**

In [1]:
import pandas as pd
import zipfile
from collections import Counter
import matplotlib.pyplot as plt

In [2]:
# import files and extract unique users

def extract_author_counts(subreddit):
    """Count how often each username appears in a subreddit's preprocessed file.

    Reads ``../preprocessing/{subreddit}/output/{subreddit}_new_preprocessing_com_rep.csv``
    and tallies the values of the ``author_child``, ``author_parent`` and
    ``author_submission`` columns together.

    Parameters
    ----------
    subreddit : str
        Subreddit name; used to build the file path and to label the rows.

    Returns
    -------
    pandas.DataFrame
        One row per distinct username (in order of first appearance) with
        the columns ``author``, ``count`` and ``subreddit``. Empty author
        cells are not counted.
    """
    filepath = f"../preprocessing/{subreddit}/output/{subreddit}_new_preprocessing_com_rep.csv"

    # A username that happens to spell one of pandas' default NA tokens
    # (e.g. "null", "nan", "NA") would otherwise be parsed as NaN and lost.
    # Only genuinely empty cells are treated as missing here.
    data = pd.read_csv(filepath, keep_default_na=False, na_values=[""])

    # Combine all author columns into one series and drop empty cells, so no
    # NaN "author" reaches the counter (or later string operations).
    all_authors = pd.concat(
        [data['author_child'], data['author_parent'], data['author_submission']],
        ignore_index=True,
    ).dropna()

    # Count occurrences; Counter keeps keys in order of first appearance.
    author_counts = Counter(all_authors)

    # list(...) keeps the constructor well-defined even when nothing was counted.
    author_df = pd.DataFrame(list(author_counts.items()), columns=['author', 'count'])
    author_df['subreddit'] = subreddit  # remember which subreddit the counts belong to

    return author_df


In [3]:
# run on all subreddits

subreddits = ["Ask_Politics", "Askpolitics", "PoliticalDebate", "PoliticalDiscussion", "NeutralPolitics"]

# One count table per subreddit, stacked under a fresh 0..n-1 index.
all_author_data = pd.concat([extract_author_counts(sub) for sub in subreddits], ignore_index=True)

# Strip surrounding double quotes from usernames (presumably only the
# PoliticalDebate export carries them -- TODO confirm) and remove the
# AutoModerator account. The vectorised .str accessor is used instead of
# .apply(lambda ...) so a missing value cannot raise an AttributeError.
all_author_data['author'] = all_author_data['author'].str.strip('"')
all_author_data = all_author_data[all_author_data['author'] != "AutoModerator"].reset_index(drop=True)

all_author_data

Unnamed: 0,author,count,subreddit
0,karmanaut,141,Ask_Politics
1,zoolander951,1,Ask_Politics
2,TehNoff,2,Ask_Politics
3,cyco,40,Ask_Politics
4,zossima,6,Ask_Politics
...,...,...,...
158068,BluejayImaginary7739,2,NeutralPolitics
158069,s0ngo,1,NeutralPolitics
158070,what_cha_want,4,NeutralPolitics
158071,Lux_Aquila,1,NeutralPolitics


In [4]:
# Number of author rows held for each subreddit.
all_author_data.groupby(by='subreddit')['author'].size()

subreddit
Ask_Politics            24427
Askpolitics              3382
NeutralPolitics         22938
PoliticalDebate          2290
PoliticalDiscussion    105036
Name: author, dtype: int64

In [5]:
# extract top x per subreddit

def get_top_x(df, subr, x):
    """Return the ``x`` rows of ``df`` with the highest ``count`` for one subreddit.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least the columns ``subreddit`` and ``count``.
    subr : str
        Value of ``subreddit`` to keep.
    x : int
        Number of leading rows to keep after sorting.

    Returns
    -------
    pandas.DataFrame
        The matching rows sorted by ``count`` in descending order, cut to the
        first ``x``; original index labels are preserved. The relative order
        of rows with equal counts is whatever the default sort produces.
    """
    in_subreddit = df['subreddit'] == subr
    ranked = df.loc[in_subreddit].sort_values(by="count", ascending=False)
    return ranked.iloc[:x]


In [6]:
# Top 100 authors of every subreddit, stacked into one table. ignore_index=True
# already renumbers the rows 0..n-1, so no per-subreddit index reset is needed.
per_subreddit_top = [get_top_x(all_author_data, sub, 100) for sub in subreddits]
top_100 = pd.concat(per_subreddit_top, ignore_index=True)
top_100

Unnamed: 0,author,count,subreddit
0,AuditorTux,2029,Ask_Politics
1,thisfunnieguy,919,Ask_Politics
2,chinmakes5,841,Ask_Politics
3,mormagils,799,Ask_Politics
4,gsfgf,731,Ask_Politics
...,...,...,...
495,ANewMachine615,92,NeutralPolitics
496,Eighty-8,91,NeutralPolitics
497,Kamwind,91,NeutralPolitics
498,Panzerdrek,91,NeutralPolitics


## Import politics authors separately

$\to$ **TODO:** redo this step once the author list has been rebuilt from `df` rather than from `df_comments` and `df_submission`.

In [7]:
subreddit = "politics"

# The r/politics author counts are loaded from their own precomputed file.
# The file name is derived from `subreddit` instead of repeating the literal
# (the previous f-string had no placeholder); the resulting path is unchanged.
authors_filename = f"output/{subreddit}_authors.csv"
authors_path = f"../preprocessing/{subreddit}/{authors_filename}"

# Only empty cells count as missing, so a username that spells one of pandas'
# default NA tokens (e.g. "null", "nan") is kept as a string.
authors_politics = pd.read_csv(authors_path, keep_default_na=False, na_values=[""])

# Drop the moderator / bot accounts, rank by count (highest first) and
# renumber the rows once at the end.
authors_politics = (
    authors_politics[~authors_politics['author'].isin(["PoliticsMod", "PoliticsModeratorBot", "AutoModerator"])]
    .sort_values('count', ascending=False)
    .reset_index(drop=True)
)
authors_politics

Unnamed: 0,author,count,subreddit
0,Qu1nlan,137739,politics
1,english06,121853,politics
2,therealdanhill,88413,politics
3,lotrouble,33609,politics
4,dottiemommy,31494,politics
...,...,...,...
721664,ice-cream420,1,politics
721665,youpeoplestolemyname,1,politics
721666,Gugabvs,1,politics
721667,lupin-dubious,1,politics


In [89]:
# combine all users: stack the per-subreddit counts with the r/politics counts,
# add up each author's counts across all sources and rank by the total
users_all = (
    pd.concat([all_author_data, authors_politics])
    .groupby('author', as_index=False)[['count']]
    .sum()
    .sort_values(by='count', ascending=False)
    .reset_index(drop=True)
)
users_all

Unnamed: 0,author,count
0,Qu1nlan,137739
1,english06,121853
2,therealdanhill,88413
3,Anxa,58476
4,lotrouble,33609
...,...,...
804509,fasulo_,1
804510,fastwall,1
804511,LastWalker,1
804512,Last_Account_Ever,1


In [86]:
# authors_politics is already sorted by count (descending), so the first
# 100 rows are its top 100 authors.
politics_top_100 = authors_politics.iloc[:100]

## Concatenate and Export

In [87]:
# Stack the per-subreddit top authors on top of the r/politics top authors.
# Index labels are kept as they are, so they are not unique in the result.
top_100_all = pd.concat(objs=[top_100, politics_top_100], axis=0)

In [90]:
# An author can be in the top list of several subreddits: merge those rows by
# summing their counts, then rank by the summed count (highest first).
combined_counts = (
    top_100_all
    .groupby('author', as_index=False)[['count']]
    .sum()
    .sort_values(by='count', ascending=False)
    .reset_index(drop=True)
)
combined_counts

Unnamed: 0,author,count
0,Qu1nlan,137739
1,english06,121853
2,therealdanhill,88413
3,Anxa,58438
4,lotrouble,33609
...,...,...
576,ted5011c,30
577,Poop__Pirates,30
578,BeKindToEachOther6,30
579,UnpopularUrsula,30


In [91]:
# The list of users to pull is the merged top-author table. This is a plain
# assignment, so both names refer to the same DataFrame object (no copy).
users_to_pull = combined_counts
users_to_pull

Unnamed: 0,author,count
0,Qu1nlan,137739
1,english06,121853
2,therealdanhill,88413
3,Anxa,58438
4,lotrouble,33609
...,...,...
576,ted5011c,30
577,Poop__Pirates,30
578,BeKindToEachOther6,30
579,UnpopularUrsula,30


In [92]:

# Persist both tables as CSV; the positional row index is not written.
users_all.to_csv(path_or_buf="output/users_all.csv", index=False)
users_to_pull.to_csv(path_or_buf="output/users_to_pull.csv", index=False)

In [3]:

# Reload the exported tables. Only empty cells are treated as missing, so a
# username that spells one of pandas' default NA tokens (e.g. "null", "nan")
# comes back as the same string that was written.
users_all = pd.read_csv("output/users_all.csv", keep_default_na=False, na_values=[""])
users_to_pull = pd.read_csv("output/users_to_pull.csv", keep_default_na=False, na_values=[""])

In [94]:
# Row count of the full user table.
len(users_all.index)


804514

In [95]:
# Row count of the users-to-pull table.
len(users_to_pull.index)

581

In [1]:
import pandas as pd
# Only empty cells are treated as missing, so a username that spells one of
# pandas' default NA tokens (e.g. "null", "nan") is read back as a string.
users_all = pd.read_csv("output/users_all.csv", keep_default_na=False, na_values=[""])

In [4]:
# Export the usernames only.
# NOTE(review): unlike the users_all / users_to_pull exports, the row index is
# written here as an extra unnamed first column -- confirm this is intended.
users_all.loc[:, ["author"]].to_csv("output/authors_all.csv")