# Dataset Analysis

## Import

In [1]:
import csv
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

# Dataset

In [2]:
# Read tsv file
r_body = pd.read_csv('soc-redditHyperlinks-body.tsv', sep='\t')
r_title = pd.read_csv('soc-redditHyperlinks-title.tsv', sep='\t')

# Merge two dataframes
r = pd.concat([r_body, r_title])
r = r.drop(['PROPERTIES'], axis=1)

r_neg = r[r['LINK_SENTIMENT'] == -1]
r_pos = r[r['LINK_SENTIMENT'] == 1]


In [3]:
r_body.head()

Unnamed: 0,SOURCE_SUBREDDIT,TARGET_SUBREDDIT,POST_ID,TIMESTAMP,LINK_SENTIMENT,PROPERTIES
0,leagueoflegends,teamredditteams,1u4nrps,2013-12-31 16:39:58,1,"345.0,298.0,0.75652173913,0.0173913043478,0.08..."
1,theredlion,soccer,1u4qkd,2013-12-31 18:18:37,-1,"101.0,98.0,0.742574257426,0.019801980198,0.049..."
2,inlandempire,bikela,1u4qlzs,2014-01-01 14:54:35,1,"85.0,85.0,0.752941176471,0.0235294117647,0.082..."
3,nfl,cfb,1u4sjvs,2013-12-31 17:37:55,1,"1124.0,949.0,0.772241992883,0.0017793594306,0...."
4,playmygame,gamedev,1u4w5ss,2014-01-01 02:51:13,1,"715.0,622.0,0.777622377622,0.00699300699301,0...."


In [4]:
r_title.sample(10)

Unnamed: 0,SOURCE_SUBREDDIT,TARGET_SUBREDDIT,POST_ID,TIMESTAMP,LINK_SENTIMENT,PROPERTIES
430954,conspiracy,politics,515sqes,2016-09-04 13:27:05,1,"178.0,154.0,0.797752808989,0.0112359550562,0.0..."
456364,marvel,wyt10,58jomys,2016-10-20 15:05:17,1,"45.0,39.0,0.755555555556,0.0444444444444,0.177..."
11427,enoughlibertarianspam,libertarian,1xavnks,2014-02-07 13:26:03,1,"113.0,99.0,0.796460176991,0.0,0.0530973451327,..."
276905,shitpost,creepy,3s472ks,2015-11-09 10:16:33,1,"83.0,69.0,0.734939759036,0.0,0.0963855421687,0..."
74445,goldguysplayground,askreddit,2d2v6zs,2014-08-09 10:38:41,1,"103.0,86.0,0.766990291262,0.0,0.0873786407767,..."
256793,upvotedbecausegirl,funny,3mvk9ns,2015-09-29 22:20:56,1,"102.0,87.0,0.764705882353,0.0,0.0882352941176,..."
171775,fairshareloans,randomactsofpizza,326hzcs,2015-04-10 22:11:44,1,"88.0,77.0,0.829545454545,0.0,0.159090909091,0...."
433620,drama,news,51z1vxs,2016-09-09 11:55:31,1,"62.0,54.0,0.806451612903,0.0,0.806451612903,0...."
87467,tributeme,gonewildcurvy,2gbrkys,2014-09-13 15:27:09,1,"71.0,62.0,0.746478873239,0.0,0.197183098592,0...."
77617,oppression,technology,2dw0eos,2014-08-18 00:33:54,1,"113.0,98.0,0.796460176991,0.0,0.070796460177,0..."


In [5]:
print("Percentage of positive and negative links in title")
print(r_title['LINK_SENTIMENT'].value_counts(normalize=True))

print("\nPercentage of positive and negative links in body")
print(r_body['LINK_SENTIMENT'].value_counts(normalize=True))

Percentage of positive and negative links in title
 1    0.893098
-1    0.106902
Name: LINK_SENTIMENT, dtype: float64

Percentage of positive and negative links in body
 1    0.926473
-1    0.073527
Name: LINK_SENTIMENT, dtype: float64


In [6]:
# probability that a negative link is in the body
print("Probability that a negative link is in the body")
print(len(r_body[r_body['LINK_SENTIMENT'] == -1]) / len(r_neg))

# probability that a positive link is in the title
print("Probability that a positive link is in the title")
print(len(r_title[r_title['LINK_SENTIMENT'] == 1]) / len(r_neg))


Probability that a negative link is in the body
0.2562948546405547
Probability that a positive link is in the title
6.213197907797105


# Graph

In [7]:
# get graph with networkx
G_neg = nx.from_pandas_edgelist(r_neg, 'SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT',edge_attr='LINK_SENTIMENT', create_using=nx.DiGraph())
G_pos = nx.from_pandas_edgelist(r_pos, 'SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT',edge_attr='LINK_SENTIMENT', create_using=nx.DiGraph())
G = nx.from_pandas_edgelist(r, 'SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT',edge_attr='LINK_SENTIMENT', create_using=nx.DiGraph())

# remove self loop
G_neg.remove_edges_from(nx.selfloop_edges(G_neg))
G_pos.remove_edges_from(nx.selfloop_edges(G_pos))
G.remove_edges_from(nx.selfloop_edges(G))

# remove nodes with degree 0
G_neg.remove_nodes_from(list(nx.isolates(G_neg)))
G_pos.remove_nodes_from(list(nx.isolates(G_pos)))
G.remove_nodes_from(list(nx.isolates(G)))

# print number of nodes
print('Number of nodes in negative graph: ', G_neg.number_of_nodes())
print('Number of nodes in positive graph: ', G_pos.number_of_nodes())
print('Number of nodes in total graph: ', G.number_of_nodes())


Number of nodes in negative graph:  12069
Number of nodes in positive graph:  65892
Number of nodes in total graph:  67180


In [8]:
deg_neg = dict(G_neg.degree())
deg_pos = dict(G_pos.degree())
deg = dict(G.degree())

max_deg_neg = max(deg_neg.values())
max_deg_pos = max(deg_pos.values())
max_deg = max(deg.values())

print('\nMax degree of negative graph: ', max_deg_neg)
print('Max degree of positive graph: ', max_deg_pos)
print('Max degree of total graph: ', max_deg)

# out degree
out_deg_neg = dict(G_neg.out_degree())
out_deg_pos = dict(G_pos.out_degree())
out_deg = dict(G.out_degree())

# in degree
in_deg_neg = dict(G_neg.in_degree())
in_deg_pos = dict(G_pos.in_degree())
in_deg = dict(G.in_degree())


Max degree of negative graph:  2056
Max degree of positive graph:  5504
Max degree of total graph:  5811


In [9]:
# sort out degree
out_deg_neg = {k: v for k, v in sorted(out_deg_neg.items(), key=lambda item: item[1], reverse=True)}
out_deg_pos = {k: v for k, v in sorted(out_deg_pos.items(), key=lambda item: item[1], reverse=True)}
out_deg = {k: v for k, v in sorted(out_deg.items(), key=lambda item: item[1], reverse=True)}

in_deg_neg = {k: v for k, v in sorted(in_deg_neg.items(), key=lambda item: item[1], reverse=True)}
in_deg_pos = {k: v for k, v in sorted(in_deg_pos.items(), key=lambda item: item[1], reverse=True)}
in_deg = {k: v for k, v in sorted(in_deg.items(), key=lambda item: item[1], reverse=True)}

print("total unique nodes: ", len(set(list(out_deg_neg.keys()) + list(out_deg_pos.keys()) + list(out_deg.keys()))))
print("nodes with out degree neg > 10: ", len([x for x in out_deg_neg.values() if x > 10])/len(out_deg))
print("nodes with in degree neg > 10: ", len([x for x in in_deg_neg.values() if x > 10])/len(out_deg))

total unique nodes:  67180
nodes with out degree neg > 10:  0.00821673116999107
nodes with in degree neg > 10:  0.009749925573087228


In [10]:
# mean out degree
mean_out_deg_neg = np.mean(list(out_deg_neg.values()))
mean_out_deg_pos = np.mean(list(out_deg_pos.values()))
mean_out_deg = np.mean(list(out_deg.values()))

print('\nMean out degree of negative graph: ', mean_out_deg_neg)
print('Mean out degree of positive graph: ', mean_out_deg_pos)
print('Mean out degree of total graph: ', mean_out_deg)


Mean out degree of negative graph:  3.411218825089071
Mean out degree of positive graph:  4.879788138165483
Mean out degree of total graph:  5.055715986900863


In [11]:
print("\nfirst 10 nodes with highest out degree in negative graph")
for i in range(10):
    print("\t", list(out_deg_neg.keys())[i], list(out_deg_neg.values())[i])

print("\nfirst 10 nodes with highest out degree in positive graph")
for i in range(10):
    print("\t", list(out_deg_pos.keys())[i], list(out_deg_pos.values())[i])

print("\nfirst 10 nodes with highest out degree in total graph")
for i in range(10):
    print("\t", list(out_deg.keys())[i], list(out_deg.values())[i])

print("\nfirst 10 nodes with highest in degree in negative graph")
for i in range(10):
    print("\t", list(in_deg_neg.keys())[i], list(in_deg_neg.values())[i])

print("\nfirst 10 nodes with highest in degree in positive graph")
for i in range(10):
    print("\t", list(in_deg_pos.keys())[i], list(in_deg_pos.values())[i])

print("\nfirst 10 nodes with highest in degree in total graph")
for i in range(10):
    print("\t", list(in_deg.keys())[i], list(in_deg.values())[i])




first 10 nodes with highest out degree in negative graph
	 subredditdrama 1759
	 bestof 1091
	 drama 839
	 botsrights 385
	 circlebroke2 361
	 shitpost 337
	 shitredditsays 293
	 shitliberalssay 278
	 shitamericanssay 273
	 the_donald 255

first 10 nodes with highest out degree in positive graph
	 bestof 2881
	 subredditdrama 2541
	 titlegore 2466
	 drama 1091
	 hailcorporate 907
	 switcharoo 901
	 shitredditsays 873
	 gaming 774
	 shitamericanssay 747
	 the_donald 716

first 10 nodes with highest out degree in total graph
	 bestof 3111
	 subredditdrama 3020
	 titlegore 2469
	 drama 1413
	 hailcorporate 939
	 shitredditsays 923
	 switcharoo 918
	 the_donald 798
	 shitamericanssay 793
	 botsrights 792

first 10 nodes with highest in degree in negative graph
	 askreddit 1108
	 funny 506
	 pics 500
	 worldnews 489
	 videos 472
	 todayilearned 460
	 news 445
	 iama 390
	 adviceanimals 332
	 politics 311

first 10 nodes with highest in degree in positive graph
	 askreddit 5165
	 iama 4404


In [12]:
# first 10 nodes with highest out degree neg
print('\nFirst 10 nodes with highest out degree neg:')
print('out degree neg \t\t out degree pos \t out degree \t\t node')
for n in list(out_deg_neg.keys())[:10]:
    print(out_deg_neg[n], "\t\t\t", out_deg_pos[n], "\t\t\t", out_deg[n], "\t\t\t", n)



First 10 nodes with highest out degree neg:
out degree neg 		 out degree pos 	 out degree 		 node
1759 			 2541 			 3020 			 subredditdrama
1091 			 2881 			 3111 			 bestof
839 			 1091 			 1413 			 drama
385 			 623 			 792 			 botsrights
361 			 629 			 730 			 circlebroke2
337 			 690 			 783 			 shitpost
293 			 873 			 923 			 shitredditsays
278 			 599 			 662 			 shitliberalssay
273 			 747 			 793 			 shitamericanssay
255 			 716 			 798 			 the_donald


## Conflicts

In [13]:
conflicts = r_neg.groupby(['SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT']).size().reset_index(name='counts')
conflicts = conflicts.sort_values(by=['counts'], ascending=False)
conflicts = conflicts[conflicts['counts'] > 10]
conflicts.head(10)

Unnamed: 0,SOURCE_SUBREDDIT,TARGET_SUBREDDIT,counts
31082,shitredditsays,twoxchromosomes,120
34945,subredditdrama,relationships,100
14431,evenwithcontext,askreddit,97
1102,amrsucks,againstmensrights,89
33765,subredditdrama,askreddit,88
34837,subredditdrama,pics,85
32907,srssucks,shitredditsays,79
35219,subredditdrama,todayilearned,79
35314,subredditdrama,videos,78
9006,circlebroke2,pics,77


In [14]:
# % of nodes that are in conflict 
node_in_conflicts = set(conflicts["SOURCE_SUBREDDIT"].unique()).union(set(conflicts["TARGET_SUBREDDIT"].unique()))
print('Number of nodes in conflict: ', len(node_in_conflicts))
print('Number of nodes in total graph: ', G.number_of_nodes())
print('Percentage of nodes in conflict: ', len(node_in_conflicts)/G.number_of_nodes())

attacker_nodes = set(conflicts["SOURCE_SUBREDDIT"].unique())
print('\nNumber of attacker nodes: ', len(attacker_nodes))
print('Number of nodes in total graph: ', G.number_of_nodes())
print('Percentage of attacker nodes: ', len(attacker_nodes)/G.number_of_nodes())

attacked_nodes = set(conflicts["TARGET_SUBREDDIT"].unique())
print('\nNumber of attacked nodes: ', len(attacked_nodes))
print('Number of nodes in total graph: ', G.number_of_nodes())
print('Percentage of attacked nodes: ', len(attacked_nodes)/G.number_of_nodes())

# % of attacker nodes that are also attacked
print('\nNumber of attacker nodes that are also attacked: ', len(attacker_nodes.intersection(attacked_nodes)))
print('Number of attacker nodes: ', len(attacker_nodes))
print('Percentage of attacker nodes that are also attacked: ', len(attacker_nodes.intersection(attacked_nodes))/len(attacker_nodes))


Number of nodes in conflict:  530
Number of nodes in total graph:  67180
Percentage of nodes in conflict:  0.007889252753795772

Number of attacker nodes:  240
Number of nodes in total graph:  67180
Percentage of attacker nodes:  0.003572491813039595

Number of attacked nodes:  365
Number of nodes in total graph:  67180
Percentage of attacked nodes:  0.005433164632331051

Number of attacker nodes that are also attacked:  75
Number of attacker nodes:  240
Percentage of attacker nodes that are also attacked:  0.3125


In [15]:
G_conflicts = nx.from_pandas_edgelist(conflicts, 'SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT', edge_attr='counts', create_using=nx.DiGraph())

# Measures

## Conflict graph

In [16]:
# in degree
in_deg_conflicts = dict(G_conflicts.in_degree())
in_deg_conflicts = {k: v for k, v in sorted(in_deg_conflicts.items(), key=lambda item: item[1], reverse=True)}

# out degree
out_deg_conflicts = dict(G_conflicts.out_degree())
out_deg_conflicts = {k: v for k, v in sorted(out_deg_conflicts.items(), key=lambda item: item[1], reverse=True)}

# closeness centrality
closeness_conflicts = nx.closeness_centrality(G_conflicts)
closeness_conflicts = {k: v for k, v in sorted(closeness_conflicts.items(), key=lambda item: item[1], reverse=True)}

# betweenness centrality
betweenness_conflicts = nx.betweenness_centrality(G_conflicts)
betweenness_conflicts = {k: v for k, v in sorted(betweenness_conflicts.items(), key=lambda item: item[1], reverse=True)}

# eigenvector centrality
eigenvector_conflicts = nx.eigenvector_centrality(G_conflicts)
eigenvector_conflicts = {k: v for k, v in sorted(eigenvector_conflicts.items(), key=lambda item: item[1], reverse=True)}

## Full graph

In [19]:
closeness_full = nx.closeness_centrality(G)
betweenness_full = nx.betweenness_centrality(G)
eigenvector_full = nx.eigenvector_centrality(G)

## Negative Graph

In [None]:
closeness_neg = nx.closeness_centrality(G_neg)
betweenness_neg = nx.betweenness_centrality(G_neg)
eigenvector_neg = nx.eigenvector_centrality(G_neg)

## Positive Graph

In [None]:
closeness_pos = nx.closeness_centrality(G_pos)
betweenness_pos = nx.betweenness_centrality(G_pos)
eigenvector_pos = nx.eigenvector_centrality(G_pos)

In [None]:
pd.DataFrame.from_dict(closeness_conflicts, orient='index').to_csv('results/closeness_conflicts.csv')
pd.DataFrame.from_dict(betweenness_conflicts, orient='index').to_csv('results/betweenness_conflicts.csv')
pd.DataFrame.from_dict(eigenvector_conflicts, orient='index').to_csv('results/eigenvector_conflicts.csv')

pd.DataFrame.from_dict(closeness_full, orient='index').to_csv('results/closeness_full.csv')
pd.DataFrame.from_dict(betweenness_full, orient='index').to_csv('results/betweenness_full.csv')
pd.DataFrame.from_dict(eigenvector_full, orient='index').to_csv('results/eigenvector_full.csv')

pd.DataFrame.from_dict(closeness_neg, orient='index').to_csv('results/closeness_neg.csv')
pd.DataFrame.from_dict(betweenness_neg, orient='index').to_csv('results/betweenness_neg.csv')
pd.DataFrame.from_dict(eigenvector_neg, orient='index').to_csv('results/eigenvector_neg.csv')

pd.DataFrame.from_dict(closeness_pos, orient='index').to_csv('results/closeness_pos.csv')
pd.DataFrame.from_dict(betweenness_pos, orient='index').to_csv('results/betweenness_pos.csv')
pd.DataFrame.from_dict(eigenvector_pos, orient='index').to_csv('results/eigenvector_pos.csv')