# Import

In [113]:
import csv
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib as mpl


# Function

In [125]:
def df_source_target(source, sentiment):
    df = pd.DataFrame()

    df = source[['SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT']]

    # count total number of link_sentiment == sentiment for each subreddit
    df.insert(2,'TOTAL_LINK',source.groupby(['SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT'])[
        'LINK_SENTIMENT'].transform(lambda x: (x == sentiment).sum()))

    # order dataframe by TOTAL_LINK DESC
    df = df.sort_values(by=['TOTAL_LINK'], ascending=False)

    return df

def gen_source_sink_hub(sentiment_df):
    r_body_source = pd.DataFrame()
    r_body_target = pd.DataFrame()

    # df con tutti i nodi sorgente  
    r_body_source = sentiment_df['SOURCE_SUBREDDIT'] 
    # delete the duplicate
    r_body_source = r_body_source.drop_duplicates()
    # reset index
    r_body_source = r_body_source.reset_index()
    #print("Numero di sorgenti con negative sentiment: ",len(r_body_source))

    # df con tutti i nodi target
    r_body_target = sentiment_df['TARGET_SUBREDDIT']
    # delete the duplicate
    r_body_target = sentiment_df.drop_duplicates()
    # reset index
    r_body_target = r_body_target.reset_index()
    #print("Numero di target con negative sentiment: ",len(r_body_target))

    # build a df as difference between source and target
    source = pd.DataFrame()
    target = pd.DataFrame()
    hub = pd.DataFrame()

    #drop from r_body_source all element present in r_target
    source = r_body_source[~r_body_source['SOURCE_SUBREDDIT'].isin(r_body_target['TARGET_SUBREDDIT'])]
    #drop from r_body_target all element present in r_source
    target = r_body_target[~r_body_target['TARGET_SUBREDDIT'].isin(r_body_source['SOURCE_SUBREDDIT'])]

    # insert in hub all element of r_body_source
    hub = pd.concat([r_body_source,r_body_target])
    # drop all duplicate
    hub = hub.drop_duplicates()
    # remove all element present in source
    hub = hub[~hub['SOURCE_SUBREDDIT'].isin(source['SOURCE_SUBREDDIT'])]
    # remove all element present in target
    hub = hub[~hub['TARGET_SUBREDDIT'].isin(target['TARGET_SUBREDDIT'])]
    # reset index
    hub = hub.reset_index()

    return source,target,hub


def gen_graph(df):
    # Create graph
    G = nx.from_pandas_edgelist(df, 'SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT',edge_attr='TOTAL_LINK', create_using=nx.DiGraph())

    # Remove self-loop
    G.remove_edges_from(nx.selfloop_edges(G))

    # Remove nodes with degree 0
    G.remove_nodes_from(list(nx.isolates(G)))

    return G


def gen_plot(G, title):
    # Set layout
    pos = nx.spring_layout(G, k=0.1, iterations=20)

    # Set node size
    node_size = [G.degree(v) * 10 for v in G]

    # Set node color
    node_color = [G.degree(v) for v in G]

    # Set edge width
    #edge_width = [d['TOTAL_LINK'] / 5 for (u, v, d) in G.edges(data=True)]

    # Set node label
    node_label = {v: v for v in G}

    # Set edge label
    edge_label = {(u, v): d['TOTAL_LINK'] for (u, v, d) in G.edges(data=True)}

    # Set colormap
    cmap = plt.cm.get_cmap('viridis_r')

    # Set figure size
    plt.figure(figsize=(20, 20))

    # Draw graph
    nx.draw_networkx_nodes(G, pos, node_size=node_size,
                           node_color=node_color, cmap=cmap)
    nx.draw_networkx_edges(G, pos, width=1, edge_color='lightgray')
    #nx.draw_networkx_labels(G, pos, labels=node_label, font_size=10)
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_label, font_size=8)

    # Set title
    plt.title(title, fontsize=20)

    # Set colorbar
    sm = plt.cm.ScalarMappable(
        cmap=cmap, norm=plt.Normalize(vmin=0, vmax=max(node_color)))
    sm._A = []
    plt.colorbar(sm)

    # Show plot
    plt.show()


def statistics(df):
    # make mean of all the counts
    mean = df['TOTAL_LINK'].mean()

    # make median of all the counts
    median = df['TOTAL_LINK'].median()

    # make standard deviation of all the counts
    std = df['TOTAL_LINK'].std()

    # make variance of all the counts
    var = df['TOTAL_LINK'].var()

    # make max of all the counts
    max = df['TOTAL_LINK'].max()

    # make min of all the counts
    min = df['TOTAL_LINK'].min()

    # make sum of all the counts
    sum = df['TOTAL_LINK'].sum()

    # make count of all the counts
    count = df['TOTAL_LINK'].count()

    # make mode of all the counts
    mode = df['TOTAL_LINK'].mode()

    # count all element of r where count > 10
    # !516
    r_max10 = df[df['TOTAL_LINK'] > 10].count()

    # !5659
    r_min_eq_10 = df[df['TOTAL_LINK'] <= 10].count()

    # print all as table
    print('mean: ', mean)
    print('median: ', median)
    print('std: ', std)
    print('var: ', var)
    print('max: ', max)
    print('min: ', min)
    print('sum: ', sum)
    print('count: ', count)
    print('mode: ', mode)
    print('r_max10: ', r_max10)
    print('r_min10: ', r_min_eq_10)


# Read CSV

In [115]:
# Read tsv file
r_body = pd.read_csv('soc-redditHyperlinks-body.tsv', sep='\t')
r_title = pd.read_csv('soc-redditHyperlinks-title.tsv', sep='\t')

### Dividiamo r_body ed r_title in link negativi e link positivi

In [116]:
r_body_negative = r_body[r_body['LINK_SENTIMENT'] == -1]
r_body_positive = r_body[r_body['LINK_SENTIMENT'] == 1]
r_title_negative = r_title[r_title['LINK_SENTIMENT'] == -1]
r_title_positive = r_title[r_title['LINK_SENTIMENT'] == 1]

In [124]:
print("************************REDDIT WITH NEGATIVE SENTIMENT*******************************\n\n")
print(r_body_negative)
print("\n\n************************REDDIT WITH POSITIVE SENTIMENT*******************************\n\n")
print(r_body_positive)

************************REDDIT WITH NEGATIVE SENTIMENT*******************************


       SOURCE_SUBREDDIT      TARGET_SUBREDDIT POST_ID            TIMESTAMP  \
1            theredlion                soccer  1u4qkd  2013-12-31 18:18:37   
34      karmaconspiracy                 funny  1u6fz3  2014-01-01 12:44:19   
43             badkarma              gamesell  1u6t4g  2014-01-01 16:42:14   
53           casualiama             teenagers  1u70s8  2014-01-01 17:09:46   
55            australia                sydney  1u71zd  2014-01-01 17:24:46   
...                 ...                   ...     ...                  ...   
286475      badpolitics  bannedfromthe_donald  68h72b  2017-04-30 11:57:36   
286491    tipofmytongue             deathcore  68hbx9  2017-04-30 12:34:09   
286501       soundcloud                procss  68hi5v  2017-04-30 12:16:13   
286523  enoughtrumpspam       humansbeingbros  68hxu6  2017-04-30 14:25:18   
286554   subredditdrama                 funny  68iigy 

# Dataframe

### generiamo tutte le coppie sorgente - target. Per ogni coppia teniamo traccia del numero totali di interazioni con il target

In [118]:
# df with all couple SOURCE TARGET with positive sentiment
reddit_positive = df_source_target(r_body_positive, 1)

# df with all couple SOURCE TARGET with negative sentiment
reddit_negative = df_source_target(r_body_negative, -1)


In [119]:
print(len(reddit_negative))
print(len(reddit_positive))

21070
265491


### In the context of a network, a _Hub_ is a node with a large degree, meaning it has connections with many other nodes. A node is considered a *Source* in a graph if it has in-degree of 0 (no nodes have a source as their destination); likewise, a node is considered a _Sink_ in a graph if it has out-degree of 0 (no nodes have a sink as their source).

In [148]:
source_negative,sink_negative,hub_negative = gen_source_sink_hub(r_body_negative)
source_positive,sink_positive,hub_positive = gen_source_sink_hub(r_body_positive)

# # check if element in source are in target
# print(source_negative[source_negative['SOURCE_SUBREDDIT'].isin(target_negative['TARGET_SUBREDDIT'])])

# # check if element in target are in source
# print(target_negative[target_negative['TARGET_SUBREDDIT'].isin(source_negative['SOURCE_SUBREDDIT'])])

In [149]:
print("Numero di nodi source con negative sentiment: ",len(source_negative))
print("Numero di nodi sink con negative sentiment: ",len(sink_negative))
print("Numero di nodi hub con negative sentiment: ",len(hub_negative))

print("Numero di nodi source con positive sentiment: ",len(source_positive))
print("Numero di nodi sink con positive sentiment: ",len(sink_positive))
print("Numero di nodi hub con positive sentiment: ",len(hub_positive))

Numero di nodi source con negative sentiment:  2463
Numero di nodi sink con negative sentiment:  6538
Numero di nodi hub con negative sentiment:  13269
Numero di nodi source con positive sentiment:  14895
Numero di nodi sink con positive sentiment:  25716
Numero di nodi hub con positive sentiment:  222950


In [157]:
source_negative_2 = reddit_negative[reddit_negative['SOURCE_SUBREDDIT'].isin(source_negative['SOURCE_SUBREDDIT'])]

print(source_negative_2.groupby(['SOURCE_SUBREDDIT','TARGET_SUBREDDIT'])['SOURCE_SUBREDDIT'])

source_negative_2

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7fc99067db70>


Unnamed: 0,SOURCE_SUBREDDIT,TARGET_SUBREDDIT,TOTAL_LINK
8028,circloljerk,leagueoflegends,30
40790,circloljerk,leagueoflegends,30
78916,circloljerk,leagueoflegends,30
69955,circloljerk,leagueoflegends,30
79474,circloljerk,leagueoflegends,30
...,...,...,...
122616,botwatch,requestabot,1
122634,stlouiscirclejerk,stlouis,1
122692,casualpokemontrades,pokemonplaza,1
122707,compulsiveskinpicking,todayilearned,1


# Statistics

In [None]:
statistics(reddit_negative)
# statistics(reddit_positive)


# Graph visualization

## Negative sentiment

In [None]:
G_negative = gen_graph(reddit_negative[reddit_negative['TOTAL_LINK'] > 10])
gen_plot(G_negative, 'negative sentiment')


## Positive sentiment

In [None]:
G_positive = gen_graph(reddit_positive[reddit_positive['TOTAL_LINK'] > 10])
gen_plot(G_positive, 'positive sentiment')


In [None]:
# show rows where link_sentiment is -1 (negative) and group by subreddit and count the number of rows
r = r_body[r_body['LINK_SENTIMENT'] == -1].groupby('SOURCE_SUBREDDIT')['LINK_SENTIMENT'].count()

# order by count
r = r.sort_values(ascending=False)
r


In [None]:
# show rows where link_sentiment is -1 (negative) and group by subreddit and count the number of rows
r = r_title[r_title['LINK_SENTIMENT'] == -1].groupby('SOURCE_SUBREDDIT')['LINK_SENTIMENT'].count()

# order by count
r = r.sort_values(ascending=False)
