In [None]:
import datetime
import json
from collections import Counter

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import powerlaw
import seaborn as sns
from pymongo import MongoClient
from scipy.stats import ranksums
from sklearn.preprocessing import normalize
from wasabi import msg

# Apply seaborn's "ticks" style to every figure drawn in this notebook.
sns.set_theme(style="ticks")

## Input data

In [None]:
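# Load and pre-process the data from the .csv here.
# The suggested library for managing the data is pandas.
# The cells below expect this cell to define two DataFrames:
#   - comments_df, with columns: id, parent_id, author, body, score, created, cflairs, pol_cmp, tb_pol
#   - threads_df, with columns: id, author, created, author_flair_richtext
# ...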

## Descriptive analysis

1. Distribution of comments (by comment length, by score, and by number of comments per author)

In [None]:
# Character length of every comment body (each body is passed through str() first).
dist = [len(str(body)) for body in comments_df.body]
# length -> number of comments having that length
k = Counter(dist)

# Discrete power-law fit over the observed lengths.
pl = powerlaw.Fit(dist, discrete=True, verbose=False)
print(f'alpha = {pl.alpha}')
print(f'∂ = {pl.D}')

msg.info("No. of comments having length n:")
msg.info(f"min: {min(dist)}, avg: {np.mean(dist)}, max: {max(dist)}")

fig, ax = plt.subplots(figsize=(8,5))

# One point per distinct length: x = length, y = how many comments have it.
sns.scatterplot(x=list(k), y=list(k.values()), ax=ax, linewidth=0., alpha=.75);

ax.set_ylabel('No. of comments', fontsize=14)
ax.set_xlabel('Comment length', fontsize=14)
ax.set(xscale='log', yscale='log')
ax.grid(True)

In [None]:
# Raw comment scores and how many comments share each score value.
dist = comments_df.score.values
k = Counter(dist)

# Negative scores are sign-flipped so the fit receives positive values.
print('negative score')
pl = powerlaw.Fit(-dist[dist < 0], discrete=True, verbose=False)
print(f'alpha = {pl.alpha}')
print(f'∂ = {pl.D}')

print('positive score')
pl = powerlaw.Fit(dist[dist > 0], discrete=True, verbose=False)
print(f'alpha = {pl.alpha}')
print(f'∂ = {pl.D}')

msg.info("No. of comments having score s:")
msg.info(f"min: {min(dist)}, avg: {np.mean(dist)}, max: {max(dist)}")

# Left panel (narrower): scores <= 0; right panel: scores > 0.
fig, ax = plt.subplots(1, 2, gridspec_kw={'width_ratios': [2, 4]}, figsize=(8,5))

pos_x = [score for score in k if score > 0]
pos_y = [k[score] for score in pos_x]
neg_x = [score for score in k if score <= 0]
neg_y = [k[score] for score in neg_x]

sns.scatterplot(x=pos_x, y=pos_y, linewidth=0., alpha=.75, ax=ax[1]);
sns.scatterplot(x=neg_x, y=neg_y, linewidth=0., alpha=.75, ax=ax[0]);

ax[0].set_ylabel('No. of comments', fontsize=14)
ax[0].set_xlabel('Score (negative)', fontsize=14)
ax[1].set_ylabel('')
ax[1].set_xlabel('Score (positive)', fontsize=14)

# Both panels share the same scales and grid settings.
for panel in ax:
    panel.set_yscale('log')
    panel.set_xscale('symlog')
    panel.grid(True)

In [None]:
# Number of comments written by each author (non-null `id` values per author).
dist = comments_df.groupby('author').id.count().values
# comments-per-author value -> number of authors with exactly that many comments
k = Counter(dist)

# Discrete power-law fit over the per-author comment counts.
pl = powerlaw.Fit(dist, discrete=True, verbose=False)
print(f'alpha = {pl.alpha}')
print(f'∂ = {pl.D}')

# The statistics below are taken over `dist`, i.e. comments per author.
msg.info("No. of comments per author:")
msg.info(f"min: {min(dist)}, avg: {np.mean(dist)}, max: {max(dist)}")

fig, ax = plt.subplots(figsize=(8,5))

# x = number of comments an author wrote, y = number of authors with that count.
sns.scatterplot(x=list(k.keys()), y=[k[x] for x in k.keys()], ax=ax, linewidth=0., alpha=.75);

# Labels follow the plotted data: the original cell had the two axis labels swapped.
ax.set_ylabel('No. of authors', **{'fontsize': 14})
ax.set_xlabel('No. of comments', **{'fontsize': 14})
ax.set_yscale('log')
ax.set_xscale('log')
ax.grid(True)

## Team Identification

In [None]:
# Flair identifiers of the teams under analysis, one row per flair prefix.
# The order matters: later cells use it for plotting order and colour assignment.
teams_flairs = [
    'cnjdg', 'cnlgd', 'cnsng', 'cntop',
    'ruuol',
    'eufnc', 'eug2', 'euml', 'eurogue',
    'kodwg', 'kokdx', 'kogen',
    'nafq', 'natl', 'natsm',
    'twmad',
    'cnpsg',
]

In [None]:
teams_count = list()
teams_authors = dict()

# Parse every `cflairs` string once (instead of once per team) into a set of
# lower-cased flairs.
# NOTE(review): eval() executes whatever the string contains; if the .csv is not
# fully trusted, ast.literal_eval is the safe way to parse a list literal.
flairs_lower = comments_df.cflairs.apply(lambda x: {y.lower() for y in eval(x)})

for team_flair in teams_flairs:
    # Rows whose comment carries this team's flair (case-insensitive match).
    mask = flairs_lower.apply(lambda flairs, d=team_flair: d in flairs)
    supporters = set(comments_df[mask].author.unique())
    teams_count.append((team_flair, len(supporters)))
    teams_authors[team_flair] = supporters

# One row per team with the number of distinct authors carrying its flair.
teams_count = pd.DataFrame.from_records(teams_count, columns=['team', 'count'])
print("Avg no. of authors supporting a team:", teams_count['count'].mean(), teams_count['count'].std())

In [None]:
# One colour per entry of `teams_flairs`, built from (palette index, repeat count)
# pairs so that consecutive teams share a colour. The counts sum to 17.
_base_palette = sns.color_palette()
team_color_palette = tuple(
    _base_palette[palette_idx]
    for palette_idx, repeats in [(0, 4), (3, 1), (1, 4), (6, 3), (-2, 3), (2, 1), (4, 1)]
    for _ in range(repeats)
)

In [None]:
# Horizontal bar chart: one bar per team, bar length = number of distinct authors.
fig, ax = plt.subplots(figsize=(8,5))

sns.barplot(
    data=teams_count,
    x='count',
    y='team',
    orient='h',
    palette=team_color_palette,
    linewidth=0.,
    alpha=1.0,
    ax=ax,
)
ax.set_ylabel('Team', fontsize=14)
ax.set_xlabel('No. of authors', fontsize=14)
ax.grid(True)

## Network-based Analysis

In [None]:
# Author interaction network: an edge (u, v) means author u replied to author v,
# where v wrote either the parent comment ('t1' prefix) or the parent thread.
ain_nodes = comments_df.author.unique()
ain_edges = set()

# id -> author lookups built once, replacing a full DataFrame scan per comment.
# drop_duplicates keeps the first row per id, matching the former `.values[0]`.
_comment_rows = comments_df.drop_duplicates('id')
_comment_author = dict(zip(_comment_rows.id, _comment_rows.author))
_thread_rows = threads_df.drop_duplicates('id')
_thread_author = dict(zip(_thread_rows.id, _thread_rows.author))

for record in comments_df.itertuples():
    u = record.author
    parent_type, parent_id = record.parent_id.split('_')
    parent_author = _comment_author if parent_type == 't1' else _thread_author
    # Parents missing from the data are skipped, for comments and threads alike
    # (a missing thread used to raise IndexError).
    if parent_id in parent_author:
        ain_edges.add((u, parent_author[parent_id]))

# Per-author mean sentiment (pol_cmp) and mean score, computed in one groupby pass.
_author_means = comments_df.groupby('author')[['pol_cmp', 'score']].mean()
ain_nodes_cmp = {u: _author_means.pol_cmp.get(u, np.nan) for u in ain_nodes}
ain_nodes_score = {u: _author_means.score.get(u, np.nan) for u in ain_nodes}

# Build the directed graph that the following cells analyse; they query
# in/out-degree and weakly connected components, hence a DiGraph.
ain = nx.DiGraph()
ain.add_nodes_from(ain_nodes)
ain.add_edges_from(ain_edges)
# NOTE(review): attaching the per-author means as node attributes is an assumption
# about what these dicts were for; the attribute names are chosen here — confirm.
nx.set_node_attributes(ain, ain_nodes_cmp, 'pol_cmp')
nx.set_node_attributes(ain, ain_nodes_score, 'score')

In [None]:
# Summary statistics of the author interaction network.
# The first line prints: node count, edge count, mean in-degree, mean out-degree.
print(
    'nodes, edges, avg. indegree:',
    ain.number_of_nodes(),
    ain.number_of_edges(),
    sum(degree for _, degree in ain.in_degree()) / ain.number_of_nodes(),
    sum(degree for _, degree in ain.out_degree()) / ain.number_of_nodes(),
)
print('density:', nx.density(ain))
print('clustering:', nx.average_clustering(ain))
# Both component figures are computed on weakly connected components.
print('n. of s connected components:', sum(1 for _ in nx.weakly_connected_components(ain)))
print('size of the maximum s connected components:', max(map(len, nx.weakly_connected_components(ain))))

In [None]:
# Betweenness and eigenvector centrality of every author, plus the 20 highest
# scoring (author, centrality) pairs of each in descending order.
betw_cent = nx.betweenness_centrality(ain, normalized=True)
betw_cent_sorted = sorted(betw_cent.items(), key=lambda item: item[1], reverse=True)[:20]
eigen_cent = nx.eigenvector_centrality(ain)
eigen_cent_sorted = sorted(eigen_cent.items(), key=lambda item: item[1], reverse=True)[:20]

In [None]:
# Team interaction network: one (ui, uj) entry per reply from a comment carrying
# team flair ui to a parent (comment or thread) carrying team flair uj.
# Duplicates are kept on purpose; they are counted into edge weights later.
tin_nodes = teams_flairs
tin_edges = list()

# id -> raw flair string lookups built once, replacing a full DataFrame scan per
# comment. drop_duplicates keeps the first row per id, matching `.values[0]`.
_comment_rows = comments_df.drop_duplicates('id')
_comment_flairs = dict(zip(_comment_rows.id, _comment_rows.cflairs))
_thread_rows = threads_df.drop_duplicates('id')
_thread_richtext = dict(zip(_thread_rows.id, _thread_rows.author_flair_richtext))

# NOTE(review): eval() executes whatever the string contains; if the .csv is not
# fully trusted, ast.literal_eval is the safe way to parse these literals.
# NOTE(review): flairs are compared case-sensitively here, whereas the team
# identification cell lower-cases them first — confirm which is intended.
for record in comments_df.itertuples():
    tfs_i = eval(record.cflairs)
    parent_type, parent_id = record.parent_id.split('_')
    if parent_type == 't1':
        if parent_id not in _comment_flairs:
            # The parent comment is not in the data: nothing to link to.
            continue
        tfs_j = eval(_comment_flairs[parent_id])
    else:
        if parent_id not in _thread_richtext:
            # Missing thread: skipped like a missing parent comment
            # (this used to raise IndexError).
            continue
        # Thread flairs are the 'a' entries of the rich text, minus the first
        # and last character of each.
        tfs_j = [part['a'][1:-1] for part in eval(_thread_richtext[parent_id]) if 'a' in part]
    # Pair the replier's flairs with the PARENT's flairs. The comment branch
    # used to iterate tfs_i twice, leaving tfs_j unused.
    for ui in tfs_i:
        for uj in tfs_j:
            if ui in teams_flairs and uj in teams_flairs:
                tin_edges.append((ui, uj))

In [None]:
# Collapse repeated (source, target) pairs into weighted edges:
# weight = number of times the pair occurs in `tin_edges`.
freq = Counter(tin_edges)
tin_edges_weight = [(source, target, weight) for (source, target), weight in freq.items()]

In [None]:
# Directed team interaction graph: every team is a node (even if isolated) and
# each (source, target, weight) triple becomes an edge with a 'weight' attribute.
tin = nx.DiGraph()
tin.add_nodes_from(tin_nodes)
tin.add_weighted_edges_from(tin_edges_weight, weight='weight')

# Export the graph to GraphML in the current working directory.
nx.write_graphml(tin, path='tin.graphml')

In [None]:
# Summary statistics of the team interaction network.
# The first line prints: node count, edge count, mean in-degree, mean out-degree.
print(
    'nodes, edges, avg. indegree:',
    tin.number_of_nodes(),
    tin.number_of_edges(),
    sum(degree for _, degree in tin.in_degree()) / tin.number_of_nodes(),
    sum(degree for _, degree in tin.out_degree()) / tin.number_of_nodes(),
)
print('density:', nx.density(tin))
print('clustering:', nx.average_clustering(tin))
# Both component figures are computed on weakly connected components.
print('n. of s connected components:', sum(1 for _ in nx.weakly_connected_components(tin)))
print('size of the maximum s connected components:', max(map(len, nx.weakly_connected_components(tin))))
# Mean and standard deviation of the edge weights (last item of each triple).
print('avg edge weight:', np.mean([edge[-1] for edge in tin_edges_weight]))
print('std edge weight:', np.std([edge[-1] for edge in tin_edges_weight]))

In [None]:
# Team-by-team interaction matrix: rows = source team (t1), columns = target
# team (t2), cell = edge weight, with 0 where no edge exists.
tin_df = pd.DataFrame.from_records(tin_edges_weight, columns=['t1', 't2', 'w'])
# DataFrame.pivot takes keyword-only arguments in pandas >= 2.0; the positional
# call raises TypeError there.
tin_df_2 = tin_df.pivot(index="t1", columns="t2", values="w").fillna(0).astype('int32')

fig, ax = plt.subplots(figsize=(10,5.75))
sns.heatmap(tin_df_2, annot=True, fmt='d', cmap="CMRmap_r", linewidths=.5, color="#333333", ax=ax, alpha=.75);

# Axis labels are left empty; the tick labels already show the team flairs.
ax.set_ylabel('', **{'fontsize': 14});
ax.set_xlabel('', **{'fontsize': 14});

# Border settings for the axes background patch (line width given as a number,
# not a string).
ax.patch.set_edgecolor('black')
ax.patch.set_linewidth(1)

In [None]:
# (team, sentiment) pair for every team flair attached to every comment.
sent_team = [
    (tf, record.pol_cmp)
    for record in comments_df.itertuples()
    for tf in eval(record.cflairs)
    if tf in teams_flairs
]

# Regroup the pairs so that teams appear in `teams_flairs` order; within a team
# the original comment order is preserved.
sent_team_ordered = [
    pair
    for tf in teams_flairs
    for pair in sent_team
    if pair[0] == tf
]

sent_team = pd.DataFrame.from_records(sent_team_ordered, columns=['team', 'sentiment'])

In [None]:
# Box plot of the sentiment values of each team's comments, one box per team.
fig, ax = plt.subplots(figsize=(8,5))

sns.boxplot(
    data=sent_team,
    x='sentiment',
    y='team',
    width=0.525,
    linewidth=0.75,
    palette='colorblind',
    ax=ax,
);

# Fix the x-axis to the range [-1, 1].
ax.set_xlim(-1.0, 1.0);

ax.set_ylabel('', fontsize=14);
ax.set_xlabel('Sentiment value', fontsize=14);
ax.grid(True)

In [None]:
# (team, score) pair for every team flair attached to every comment.
score_team = [
    (tf, record.score)
    for record in comments_df.itertuples()
    for tf in eval(record.cflairs)
    if tf in teams_flairs
]

# Regroup the pairs so that teams appear in `teams_flairs` order; within a team
# the original comment order is preserved.
score_team_ordered = [
    pair
    for tf in teams_flairs
    for pair in score_team
    if pair[0] == tf
]

score_team = pd.DataFrame.from_records(score_team_ordered, columns=['team', 'score'])

In [None]:
# Strip plot of comment scores, one row of points per team.
fig, ax = plt.subplots(figsize=(8,5))

sns.stripplot(
    data=score_team,
    x='score',
    y='team',
    palette='colorblind',
    ax=ax,
);

ax.set_ylabel('', fontsize=14);
ax.set_xlabel('Comment score', fontsize=14);
ax.grid(True)

## Breakpoints

In [None]:
def bp_analysis(tid_bp, flair_str):
    """Compare the comments matching a flair before and after a breakpoint thread.

    The breakpoint is the ``created`` value of the thread in ``threads_df`` whose
    id equals ``tid_bp``. Comments in ``comments_df`` whose ``cflairs`` string
    contains ``flair_str`` are split into those created strictly before the
    breakpoint and those created at or after it. Summary statistics are logged,
    three rank-sum tests (``pol_cmp``, ``tb_pol``, ``score``) are printed, and
    the two ``pol_cmp`` distributions are drawn as stacked histograms (before
    on top, after below).

    Parameters
    ----------
    tid_bp : str
        Id of the thread that marks the breakpoint.
    flair_str : str
        Pattern looked up in ``cflairs`` via ``Series.str.contains``, which
        treats it as a regular expression by default.

    Returns
    -------
    tuple
        ``(before_bp_df, after_bp_df, overlapping_authors)``, where the last
        item is the set of authors present in both frames.

    Raises
    ------
    IndexError
        If no thread in ``threads_df`` has id ``tid_bp``.
    """
    bp = threads_df[threads_df.id == tid_bp].created.values[0]
    # The flair mask is the same for both halves, so it is computed once.
    has_flair = comments_df.cflairs.str.contains(flair_str)
    before_bp_df = comments_df[has_flair & (comments_df.created < bp)]
    after_bp_df = comments_df[has_flair & (comments_df.created >= bp)]

    msg.info("No. of comments before bp: {}".format(len(before_bp_df)))
    msg.info("No. of comments after bp: {}".format(len(after_bp_df)))

    msg.info("No. of distinct authors before bp: {}".format(len(before_bp_df.author.unique())))
    # This message reports the after-frame; it was previously labelled "before".
    msg.info("No. of distinct authors after bp: {}".format(len(after_bp_df.author.unique())))

    overlapping_authors = set(before_bp_df.author.values).intersection(set(after_bp_df.author.values))
    msg.info("No. of authors overlapping between before and after: {}".format(len(overlapping_authors)))

    msg.info("Max, min and avg sentiment before: {} {} {} (± {})".format(
        before_bp_df.pol_cmp.max(),
        before_bp_df.pol_cmp.min(),
        before_bp_df.pol_cmp.mean(),
        before_bp_df.pol_cmp.std()))
    msg.info("Max, min and avg sentiment after: {} {} {} (± {})".format(
        after_bp_df.pol_cmp.max(),
        after_bp_df.pol_cmp.min(),
        after_bp_df.pol_cmp.mean(),
        after_bp_df.pol_cmp.std()))

    # Mean and standard deviation of the comment score, before then after.
    msg.info("{} {}".format(before_bp_df.score.mean(), before_bp_df.score.std()))
    msg.info("{} {}".format(after_bp_df.score.mean(), after_bp_df.score.std()))

    # Rank-sum tests between the before and after samples
    # (ranksums is imported from scipy.stats at the top of the notebook).
    print(ranksums(before_bp_df.pol_cmp.values, after_bp_df.pol_cmp.values))
    print(ranksums(before_bp_df.tb_pol.values, after_bp_df.tb_pol.values))
    print(ranksums(before_bp_df.score.values, after_bp_df.score.values))

    fig, ax = plt.subplots(2, 1, figsize=(8,5))
    sns.histplot(data=before_bp_df.pol_cmp.values, bins=30, ax=ax[0])
    sns.histplot(data=after_bp_df.pol_cmp.values, bins=30, ax=ax[1])

    return before_bp_df, after_bp_df, overlapping_authors

def extract_bp_df(tid_bp='jct1io', flair_str='eufnc'):
    """Split the comments matching a flair around a breakpoint thread.

    The breakpoint is the ``created`` value of the thread in ``threads_df`` whose
    id equals ``tid_bp``. Only comments whose ``cflairs`` string contains
    ``flair_str`` are kept.

    Returns a ``(before, after)`` tuple of DataFrames: comments created strictly
    before the breakpoint, and comments created at or after it.
    """
    breakpoint_time = threads_df.loc[threads_df.id == tid_bp, 'created'].values[0]
    has_flair = comments_df.cflairs.str.contains(flair_str)

    created_before = comments_df.created < breakpoint_time
    created_after = comments_df.created >= breakpoint_time

    return comments_df[has_flair & created_before], comments_df[has_flair & created_after]