In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [43]:
# Strip the Reddit kind prefix ('t1_' for comments, 't3_' for submissions, ...).
def extract_id(parent_id):
    """Return the bare id contained in a Reddit "fullname".

    A fullname is a kind prefix ('t1_' ... 't6_') followed by the id itself.
    The prefix is removed only when it is actually present, so an id that is
    already bare is returned unchanged instead of losing its first three
    characters.

    Parameters
    ----------
    parent_id : str
        A fullname such as 't1_abc123' / 't3_abc123', or an already-bare id.

    Returns
    -------
    str
        The id without its kind prefix.
    """
    kind_prefixes = ('t1_', 't2_', 't3_', 't4_', 't5_', 't6_')
    if parent_id.startswith(kind_prefixes):
        return parent_id[3:]
    # No recognised prefix: nothing to strip.
    return parent_id

# Comment parents carry the 't1_' prefix: strip it. Anything else yields 0.
def extract_comm_ids(parent_id):
    """Return the bare comment id of a 't1_'-prefixed parent, otherwise 0.

    Parameters
    ----------
    parent_id : str
        Parent reference, e.g. 't1_abc123' or 't3_abc123'.

    Returns
    -------
    str or int
        The text after the 't1_' prefix when the prefix is present; the
        integer 0 for every other input.
    """
    is_comment_parent = parent_id.startswith('t1_')
    return parent_id[3:] if is_comment_parent else 0

In [44]:
# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
filename = "c:/Users/HP/Desktop/UNI/LM_1/SNA/SNA/data/small_comm.csv"
df_comments_orig = pd.read_csv(filename)

# Keep only the rows that have both a 'link_id' and a 'parent_id'.
# Both columns are passed through extract_id below, which expects a string
# and raises on a missing value (NaN is a float).
# The untouched table stays available as df_comments_orig.
df_comments = df_comments_orig.dropna(subset=['link_id', 'parent_id']).copy()

# 'parent_comment_id': the parent reference without its kind prefix.
# Despite the name, the parent may be a comment ('t1_') or a submission ('t3_').
df_comments['parent_comment_id'] = df_comments['parent_id'].apply(extract_id)

# 'parent_submission_id': the 'link_id' reference without its kind prefix.
df_comments['parent_submission_id'] = df_comments['link_id'].apply(extract_id)

# Create an array with all the unique comment ids (taken from the filtered rows).
comments_ids = df_comments['id'].unique()

In [45]:
filename = "c:/Users/HP/Desktop/UNI/LM_1/SNA/SNA/data/small_users.csv"

# Load the table that the rest of the notebook uses as the submissions data.
df_submissions = pd.read_csv(filepath_or_buffer=filename)

# Distinct values of the 'id' column, as an array.
submissions_ids = pd.unique(df_submissions['id'])

In [46]:
# Every id that can be resolved as a parent: the known comment ids followed by
# the known submission ids.
known_ids = np.concatenate((comments_ids, submissions_ids))

# Keep only the comments whose 'parent_comment_id' is one of those ids.
has_known_parent = df_comments['parent_comment_id'].isin(known_ids)
df_comments = df_comments[has_known_parent]

# Drop the columns this notebook treats as no longer needed.
df_comments = df_comments.drop(
    columns=['author_flair_text', 'body', 'parent_id', 'link_id', 'score']
)

In [47]:
# Columns taken from both the submissions table and the raw comments table.
shared_cols = ['id', 'author', 'author_flair_css_class']

# df_full: submissions stacked on top of *all* original comments
# (df_comments_orig, not the filtered df_comments). Original row indices are kept.
df_full = pd.concat([df_submissions[shared_cols], df_comments_orig[shared_cols]])

# df_auth_flair: one row per distinct (author, flair) pair found in df_full.
df_auth_flair = df_full[['author', 'author_flair_css_class']].drop_duplicates()

In [48]:
# Lookup table: id -> author, built from every row of df_full.
# When an id occurs more than once, the last row seen wins.
# NOTE(review): submission ids and comment ids share this one dictionary; if a
# submission and a comment could ever have the same bare id, one author would
# overwrite the other. Confirm that the two id spaces do not collide.
id_to_auth = dict(zip(df_full['id'], df_full['author']))

In [49]:
# Add a 'parent_author' column: the author of the comment/submission each row
# replies to, looked up by 'parent_comment_id'. An id missing from id_to_auth
# raises KeyError.
lookup_author = id_to_auth.__getitem__
df_comments['parent_author'] = df_comments['parent_comment_id'].apply(lookup_author)

In [50]:
# a: one [author, parent_author] pair per remaining comment.
a = df_comments[['author', 'parent_author']].values.tolist()

# b: one "author parent_author" string per reply, i.e. the edge format used to
# build the network. Replies to oneself (self-loops) are left out.
b = [f'{src} {dst}' for src, dst in a if src != dst]

In [51]:
# One row per reply, holding its edge string.
df_b = pd.DataFrame(data=b, columns=['edge'])

# Edge weight = number of times the same edge string occurs.
edge_sizes = df_b.groupby(['edge']).size()
grouped_df = edge_sizes.reset_index(name='count')

In [52]:
# Display the edges ordered from the highest count to the lowest.
grouped_df.sort_values(by='count', ascending=False)

Unnamed: 0,edge,count
228526,Lastrevio snowylion,219
725633,snowylion Lastrevio,184
423706,Tyuee angeldevilmybeloved,155
94533,DWLlama DaintyDamnation,132
95971,DaintyDamnation DWLlama,124
...,...,...
290958,NunuG0ddess Ailuros8833,1
290959,NunuG0ddess Belial0909,1
290961,NunuG0ddess ChaiTK,1
290962,NunuG0ddess Cuddlyzombie91,1


In [53]:
# How many authors appear with more than one flair (True) versus at most one (False).
df_auth_flair['author'].value_counts().gt(1).value_counts()

count
False    51658
True      3414
Name: count, dtype: int64

In [54]:
# Example of an author recorded with many different flairs.
# The original note counted 11 of the 16 possible flairs.
# NOTE(review): re-check that count against the rows displayed for this cell.
df_auth_flair.loc[df_auth_flair['author'] == 'unqy']

Unnamed: 0,author,author_flair_css_class
1327,unqy,infj
1519,unqy,infp
1646,unqy,isfp
2007,unqy,enfj
2475,unqy,intp
42481,unqy,enfp
44568,unqy,entp
58495,unqy,isfj
89073,unqy,estp
90037,unqy,istp


In [55]:
# Number of (author, flair) rows per author.
flairs_per_author = df_auth_flair['author'].value_counts()

# Authors that occur with more than one flair ...
duplicates = flairs_per_author[flairs_per_author > 1].index

# ... are removed entirely, so every remaining author has a single row.
df_auth_flair = df_auth_flair.loc[~df_auth_flair['author'].isin(duplicates)]

In [56]:
# Sanity check after the clean-up: no author should be counted more than once.
df_auth_flair['author'].value_counts().gt(1).value_counts()

count
False    51658
Name: count, dtype: int64

### Network

In [57]:
import networkx as nx