# Dataset Analysis

## Import

In [1]:
import csv
import pyreadr
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity

# Dataset

In [2]:
# Read tsv file
r_body = pd.read_csv('soc-redditHyperlinks-body.tsv', sep='\t')
r_title = pd.read_csv('soc-redditHyperlinks-title.tsv', sep='\t')

# Merge two dataframes
r = pd.concat([r_body, r_title])
r = r.drop(['PROPERTIES'], axis=1)

r_neg = r[r['LINK_SENTIMENT'] == -1]
r_pos = r[r['LINK_SENTIMENT'] == 1]


In [3]:
r_body.head()

Unnamed: 0,SOURCE_SUBREDDIT,TARGET_SUBREDDIT,POST_ID,TIMESTAMP,LINK_SENTIMENT,PROPERTIES
0,leagueoflegends,teamredditteams,1u4nrps,2013-12-31 16:39:58,1,"345.0,298.0,0.75652173913,0.0173913043478,0.08..."
1,theredlion,soccer,1u4qkd,2013-12-31 18:18:37,-1,"101.0,98.0,0.742574257426,0.019801980198,0.049..."
2,inlandempire,bikela,1u4qlzs,2014-01-01 14:54:35,1,"85.0,85.0,0.752941176471,0.0235294117647,0.082..."
3,nfl,cfb,1u4sjvs,2013-12-31 17:37:55,1,"1124.0,949.0,0.772241992883,0.0017793594306,0...."
4,playmygame,gamedev,1u4w5ss,2014-01-01 02:51:13,1,"715.0,622.0,0.777622377622,0.00699300699301,0...."


In [4]:
r_title.sample(10)

Unnamed: 0,SOURCE_SUBREDDIT,TARGET_SUBREDDIT,POST_ID,TIMESTAMP,LINK_SENTIMENT,PROPERTIES
517657,gamedev,indiegaming,5rvv2fs,2017-02-03 11:02:52,1,"108.0,92.0,0.787037037037,0.0,0.138888888889,0..."
170517,goldredditsays,videos,31vd51s,2015-04-09 05:23:18,1,"78.0,70.0,0.692307692308,0.0512820512821,0.089..."
515338,shitthe_donaldsays,trumpforprison,5r6cq0s,2017-01-30 21:37:31,1,"94.0,84.0,0.808510638298,0.0212765957447,0.063..."
22050,switcharoo,mapporn,20a8fcs,2014-03-12 19:08:28,1,"40.0,35.0,0.8,0.0,0.2,0.15,0.05,7.0,7.0,1.0,4...."
417487,switcharoo,oddlysatisfying,4x5pszs,2016-08-11 01:03:34,1,"27.0,24.0,0.814814814815,0.0,0.296296296296,0...."
531910,kappa,fighters,5wgssqs,2017-02-27 05:40:53,1,"109.0,91.0,0.798165137615,0.00917431192661,0.1..."
237923,drugs,netsec,3hyyzss,2015-08-22 08:08:53,1,"85.0,72.0,0.788235294118,0.0,0.141176470588,0...."
318472,negativewithgold,quityourbullshit,43d10ps,2016-01-30 08:08:20,1,"53.0,45.0,0.660377358491,0.0377358490566,0.113..."
83222,samneill,funny,2fb052s,2014-09-04 07:39:58,1,"63.0,57.0,0.84126984127,0.0,0.142857142857,0.1..."
375104,bedrocklinux,asklinuxusers,4keqxas,2016-05-21 11:17:14,1,"44.0,40.0,0.818181818182,0.0,0.295454545455,0...."


In [5]:
print("Percentage of positive and negative links in title")
print(r_title['LINK_SENTIMENT'].value_counts(normalize=True))

print("\nPercentage of positive and negative links in body")
print(r_body['LINK_SENTIMENT'].value_counts(normalize=True))

Percentage of positive and negative links in title
 1    0.893098
-1    0.106902
Name: LINK_SENTIMENT, dtype: float64

Percentage of positive and negative links in body
 1    0.926473
-1    0.073527
Name: LINK_SENTIMENT, dtype: float64


In [6]:
# probability that a negative link is in the body
print("Probability that a negative link is in the body")
print(len(r_body[r_body['LINK_SENTIMENT'] == -1]) / len(r_neg))

# probability that a positive link is in the title
print("Probability that a positive link is in the title")
print(len(r_title[r_title['LINK_SENTIMENT'] == 1]) / len(r_neg))


Probability that a negative link is in the body
0.2562948546405547
Probability that a positive link is in the title
6.213197907797105


# Subreddit similarity

In [17]:
subreddit_embeddings = pyreadr.read_r('subredsimdata.rdata')
subreddit_embeddings = pd.DataFrame(subreddit_embeddings["subsimmat"])

cosine_sim = cosine_similarity(subreddit_embeddings, subreddit_embeddings)
indices = pd.Series(subreddit_embeddings.index)

In [39]:
def similarity_score(sub1, sub2):
    if sub1 not in indices.values or sub2 not in indices.values:
        return np.nan
    idx1 = indices[indices == sub1].index[0]
    idx2 = indices[indices == sub2].index[0]
    return cosine_sim[idx1][idx2]


# Graph

In [8]:
# get graph with networkx
G_neg = nx.from_pandas_edgelist(r_neg, 'SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT',edge_attr='LINK_SENTIMENT', create_using=nx.DiGraph())
G_pos = nx.from_pandas_edgelist(r_pos, 'SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT',edge_attr='LINK_SENTIMENT', create_using=nx.DiGraph())
G = nx.from_pandas_edgelist(r, 'SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT',edge_attr='LINK_SENTIMENT', create_using=nx.DiGraph())

# remove self loop
G_neg.remove_edges_from(nx.selfloop_edges(G_neg))
G_pos.remove_edges_from(nx.selfloop_edges(G_pos))
G.remove_edges_from(nx.selfloop_edges(G))

# remove nodes with degree 0
G_neg.remove_nodes_from(list(nx.isolates(G_neg)))
G_pos.remove_nodes_from(list(nx.isolates(G_pos)))
G.remove_nodes_from(list(nx.isolates(G)))

# print number of nodes
print('Number of nodes in negative graph: ', G_neg.number_of_nodes())
print('Number of nodes in positive graph: ', G_pos.number_of_nodes())
print('Number of nodes in total graph: ', G.number_of_nodes())


Number of nodes in negative graph:  12069
Number of nodes in positive graph:  65892
Number of nodes in total graph:  67180


In [9]:
deg_neg = dict(G_neg.degree())
deg_pos = dict(G_pos.degree())
deg = dict(G.degree())

max_deg_neg = max(deg_neg.values())
max_deg_pos = max(deg_pos.values())
max_deg = max(deg.values())

print('\nMax degree of negative graph: ', max_deg_neg)
print('Max degree of positive graph: ', max_deg_pos)
print('Max degree of total graph: ', max_deg)

# out degree
out_deg_neg = dict(G_neg.out_degree())
out_deg_pos = dict(G_pos.out_degree())
out_deg = dict(G.out_degree())

# in degree
in_deg_neg = dict(G_neg.in_degree())
in_deg_pos = dict(G_pos.in_degree())
in_deg = dict(G.in_degree())


Max degree of negative graph:  2056
Max degree of positive graph:  5504
Max degree of total graph:  5811


In [10]:
# sort out degree
out_deg_neg = {k: v for k, v in sorted(out_deg_neg.items(), key=lambda item: item[1], reverse=True)}
out_deg_pos = {k: v for k, v in sorted(out_deg_pos.items(), key=lambda item: item[1], reverse=True)}
out_deg = {k: v for k, v in sorted(out_deg.items(), key=lambda item: item[1], reverse=True)}

in_deg_neg = {k: v for k, v in sorted(in_deg_neg.items(), key=lambda item: item[1], reverse=True)}
in_deg_pos = {k: v for k, v in sorted(in_deg_pos.items(), key=lambda item: item[1], reverse=True)}
in_deg = {k: v for k, v in sorted(in_deg.items(), key=lambda item: item[1], reverse=True)}

print("total unique nodes: ", len(set(list(out_deg_neg.keys()) + list(out_deg_pos.keys()) + list(out_deg.keys()))))
print("nodes with out degree neg > 10: ", len([x for x in out_deg_neg.values() if x > 10])/len(out_deg))
print("nodes with in degree neg > 10: ", len([x for x in in_deg_neg.values() if x > 10])/len(out_deg))

total unique nodes:  67180
nodes with out degree neg > 10:  0.00821673116999107
nodes with in degree neg > 10:  0.009749925573087228


In [11]:
# mean out degree
mean_out_deg_neg = np.mean(list(out_deg_neg.values()))
mean_out_deg_pos = np.mean(list(out_deg_pos.values()))
mean_out_deg = np.mean(list(out_deg.values()))

print('\nMean out degree of negative graph: ', mean_out_deg_neg)
print('Mean out degree of positive graph: ', mean_out_deg_pos)
print('Mean out degree of total graph: ', mean_out_deg)


Mean out degree of negative graph:  3.411218825089071
Mean out degree of positive graph:  4.879788138165483
Mean out degree of total graph:  5.055715986900863


In [12]:
print("\nfirst 10 nodes with highest out degree in negative graph")
for i in range(10):
    print("\t", list(out_deg_neg.keys())[i], list(out_deg_neg.values())[i])

print("\nfirst 10 nodes with highest out degree in positive graph")
for i in range(10):
    print("\t", list(out_deg_pos.keys())[i], list(out_deg_pos.values())[i])

print("\nfirst 10 nodes with highest out degree in total graph")
for i in range(10):
    print("\t", list(out_deg.keys())[i], list(out_deg.values())[i])

print("\nfirst 10 nodes with highest in degree in negative graph")
for i in range(10):
    print("\t", list(in_deg_neg.keys())[i], list(in_deg_neg.values())[i])

print("\nfirst 10 nodes with highest in degree in positive graph")
for i in range(10):
    print("\t", list(in_deg_pos.keys())[i], list(in_deg_pos.values())[i])

print("\nfirst 10 nodes with highest in degree in total graph")
for i in range(10):
    print("\t", list(in_deg.keys())[i], list(in_deg.values())[i])




first 10 nodes with highest out degree in negative graph
	 subredditdrama 1759
	 bestof 1091
	 drama 839
	 botsrights 385
	 circlebroke2 361
	 shitpost 337
	 shitredditsays 293
	 shitliberalssay 278
	 shitamericanssay 273
	 the_donald 255

first 10 nodes with highest out degree in positive graph
	 bestof 2881
	 subredditdrama 2541
	 titlegore 2466
	 drama 1091
	 hailcorporate 907
	 switcharoo 901
	 shitredditsays 873
	 gaming 774
	 shitamericanssay 747
	 the_donald 716

first 10 nodes with highest out degree in total graph
	 bestof 3111
	 subredditdrama 3020
	 titlegore 2469
	 drama 1413
	 hailcorporate 939
	 shitredditsays 923
	 switcharoo 918
	 the_donald 798
	 shitamericanssay 793
	 botsrights 792

first 10 nodes with highest in degree in negative graph
	 askreddit 1108
	 funny 506
	 pics 500
	 worldnews 489
	 videos 472
	 todayilearned 460
	 news 445
	 iama 390
	 adviceanimals 332
	 politics 311

first 10 nodes with highest in degree in positive graph
	 askreddit 5165
	 iama 4404


## Conflicts

In [13]:
conflicts = r_neg.groupby(['SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT']).size().reset_index(name='counts')
conflicts = conflicts.sort_values(by=['counts'], ascending=False)
conflicts = conflicts[conflicts['counts'] > 10]
conflicts.head(10)

Unnamed: 0,SOURCE_SUBREDDIT,TARGET_SUBREDDIT,counts
31082,shitredditsays,twoxchromosomes,120
34945,subredditdrama,relationships,100
14431,evenwithcontext,askreddit,97
1102,amrsucks,againstmensrights,89
33765,subredditdrama,askreddit,88
34837,subredditdrama,pics,85
32907,srssucks,shitredditsays,79
35219,subredditdrama,todayilearned,79
35314,subredditdrama,videos,78
9006,circlebroke2,pics,77


In [14]:
# % of nodes that are in conflict 
node_in_conflicts = set(conflicts["SOURCE_SUBREDDIT"].unique()).union(set(conflicts["TARGET_SUBREDDIT"].unique()))
print('Number of nodes in conflict: ', len(node_in_conflicts))
print('Number of nodes in total graph: ', G.number_of_nodes())
print('Percentage of nodes in conflict: ', len(node_in_conflicts)/G.number_of_nodes())

attacker_nodes = set(conflicts["SOURCE_SUBREDDIT"].unique())
print('\nNumber of attacker nodes: ', len(attacker_nodes))
print('Number of nodes in total graph: ', G.number_of_nodes())
print('Percentage of attacker nodes: ', len(attacker_nodes)/G.number_of_nodes())

attacked_nodes = set(conflicts["TARGET_SUBREDDIT"].unique())
print('\nNumber of attacked nodes: ', len(attacked_nodes))
print('Number of nodes in total graph: ', G.number_of_nodes())
print('Percentage of attacked nodes: ', len(attacked_nodes)/G.number_of_nodes())

# % of attacker nodes that are also attacked
print('\nNumber of attacker nodes that are also attacked: ', len(attacker_nodes.intersection(attacked_nodes)))
print('Number of attacker nodes: ', len(attacker_nodes))
print('Percentage of attacker nodes that are also attacked: ', len(attacker_nodes.intersection(attacked_nodes))/len(attacker_nodes))


Number of nodes in conflict:  530
Number of nodes in total graph:  67180
Percentage of nodes in conflict:  0.007889252753795772

Number of attacker nodes:  240
Number of nodes in total graph:  67180
Percentage of attacker nodes:  0.003572491813039595

Number of attacked nodes:  365
Number of nodes in total graph:  67180
Percentage of attacked nodes:  0.005433164632331051

Number of attacker nodes that are also attacked:  75
Number of attacker nodes:  240
Percentage of attacker nodes that are also attacked:  0.3125


In [15]:
G_conflicts = nx.from_pandas_edgelist(conflicts, 'SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT', edge_attr='counts', create_using=nx.DiGraph())

In [22]:
# for each couple in conflict, get similarity
conflict_similarities = []
for edge in G_conflicts.edges():
    score = similarity_score(edge[0], edge[1])
    if score > 0:
        conflict_similarities.append((edge[0], edge[1], score))

# mean 
print('Mean similarity score of conflict edges: ', np.mean([x[2] for x in conflict_similarities]))

Mean similarity score of conflict edges:  0.3512435193020667


# Measures

## Conflict graph

In [None]:
# in degree
in_deg_conflicts = dict(G_conflicts.in_degree())
in_deg_conflicts = {k: v for k, v in sorted(in_deg_conflicts.items(), key=lambda item: item[1], reverse=True)}

# out degree
out_deg_conflicts = dict(G_conflicts.out_degree())
out_deg_conflicts = {k: v for k, v in sorted(out_deg_conflicts.items(), key=lambda item: item[1], reverse=True)}

# closeness centrality
closeness_conflicts = nx.closeness_centrality(G_conflicts)
closeness_conflicts = {k: v for k, v in sorted(closeness_conflicts.items(), key=lambda item: item[1], reverse=True)}

# betweenness centrality
betweenness_conflicts = nx.betweenness_centrality(G_conflicts)
betweenness_conflicts = {k: v for k, v in sorted(betweenness_conflicts.items(), key=lambda item: item[1], reverse=True)}

# eigenvector centrality
eigenvector_conflicts = nx.eigenvector_centrality(G_conflicts)
eigenvector_conflicts = {k: v for k, v in sorted(eigenvector_conflicts.items(), key=lambda item: item[1], reverse=True)}

## Full graph

In [None]:
closeness_full = nx.closeness_centrality(G)
betweenness_full = nx.betweenness_centrality(G)
eigenvector_full = nx.eigenvector_centrality(G)

## Negative Graph

In [None]:
closeness_neg = nx.closeness_centrality(G_neg)
betweenness_neg = nx.betweenness_centrality(G_neg)
eigenvector_neg = nx.eigenvector_centrality(G_neg)

## Positive Graph

In [None]:
closeness_pos = nx.closeness_centrality(G_pos)
betweenness_pos = nx.betweenness_centrality(G_pos)
eigenvector_pos = nx.eigenvector_centrality(G_pos)

In [None]:
pd.DataFrame.from_dict(closeness_conflicts, orient='index').to_csv('results/closeness_conflicts.csv')
pd.DataFrame.from_dict(betweenness_conflicts, orient='index').to_csv('results/betweenness_conflicts.csv')
pd.DataFrame.from_dict(eigenvector_conflicts, orient='index').to_csv('results/eigenvector_conflicts.csv')

pd.DataFrame.from_dict(closeness_full, orient='index').to_csv('results/closeness_full.csv')
pd.DataFrame.from_dict(betweenness_full, orient='index').to_csv('results/betweenness_full.csv')
pd.DataFrame.from_dict(eigenvector_full, orient='index').to_csv('results/eigenvector_full.csv')

pd.DataFrame.from_dict(closeness_neg, orient='index').to_csv('results/closeness_neg.csv')
pd.DataFrame.from_dict(betweenness_neg, orient='index').to_csv('results/betweenness_neg.csv')
pd.DataFrame.from_dict(eigenvector_neg, orient='index').to_csv('results/eigenvector_neg.csv')

pd.DataFrame.from_dict(closeness_pos, orient='index').to_csv('results/closeness_pos.csv')
pd.DataFrame.from_dict(betweenness_pos, orient='index').to_csv('results/betweenness_pos.csv')
pd.DataFrame.from_dict(eigenvector_pos, orient='index').to_csv('results/eigenvector_pos.csv')

## Similarity score in graphs

In [43]:
# similarity score of full graph
r_grouped = r.groupby(['SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT']).size().reset_index(name='counts')
r_grouped["SIMILARITY"] = r_grouped.apply(lambda x: similarity_score(x["SOURCE_SUBREDDIT"], x["TARGET_SUBREDDIT"]), axis=1)
r_grouped = r_grouped.dropna()

print('Mean similarity score of full graph: ', np.mean(r_grouped["SIMILARITY"]))

Mean similarity score of full graph:  0.2874672526046712


In [41]:
# similarity score of negative graph
r_neg_grouped = r_neg.groupby(['SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT']).size().reset_index(name='counts')
r_neg_grouped["SIMILARITY"] = r_neg_grouped.apply(lambda x: similarity_score(x["SOURCE_SUBREDDIT"], x["TARGET_SUBREDDIT"]), axis=1)
r_neg_grouped = r_neg_grouped.dropna()

print('Mean similarity score of negative graph: ', np.mean(r_neg_grouped["SIMILARITY"]))

Mean similarity score of negative graph:  0.3020378108951983


In [42]:
# similarity score of positive graph
r_pos_grouped = r_pos.groupby(['SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT']).size().reset_index(name='counts')
r_pos_grouped["SIMILARITY"] = r_pos_grouped.apply(lambda x: similarity_score(x["SOURCE_SUBREDDIT"], x["TARGET_SUBREDDIT"]), axis=1)
r_pos_grouped = r_pos_grouped.dropna()

print('Mean similarity score of positive graph: ', np.mean(r_pos_grouped["SIMILARITY"]))

Mean similarity score of positive graph:  0.28979982991020664


In [40]:
# similarity score of conflict graph
r_conflict_grouped = r_neg.groupby(['SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT']).size().reset_index(name='counts')
r_conflict_grouped["SIMILARITY"] = r_conflict_grouped.apply(lambda x: similarity_score(x["SOURCE_SUBREDDIT"], x["TARGET_SUBREDDIT"]), axis=1)
r_conflict_grouped = r_conflict_grouped.dropna()
r_conflict_grouped = r_conflict_grouped[r_conflict_grouped["counts"] > 9]

print('Mean similarity score of negative graph: ',
      np.mean(r_conflict_grouped["SIMILARITY"]))


Mean similarity score of negative graph:  0.35427542834280273


In [58]:
from sklearn.manifold import TSNE

# Load your dataframe
df = r_conflict_grouped

# Create a 2D array of similarity scores
similarity_scores = np.array(df['SIMILARITY']).reshape(-1, 1)

# Perform t-SNE to reduce the dimensionality of your data
tsne = TSNE(n_components=2, random_state=42)
tsne_scores = tsne.fit_transform(similarity_scores)

# Plot the results
plt.scatter(tsne_scores[:, 0], tsne_scores[:, 1])
plt.title('t-SNE plot of subreddit similarity scores')
plt.xlabel('t-SNE dimension 1')
plt.ylabel('t-SNE dimension 2')
plt.show()





IndexError: index 1 is out of bounds for axis 1 with size 1