In [None]:
# `display` and `HTML` are public API of IPython.display; importing them from
# IPython.core.display is deprecated.
from IPython.display import display, HTML

# Inject CSS that sets `.container` elements to 75% of the available width.
display(HTML("<style>.container { width:75% !important; }</style>"))

In [None]:
import pandas as pd
import numpy as np

# The comment dump is split across twelve files, csv-0.csv through csv-11.csv.
comments_path = 'ASM_PZ2_podaci_2122/reddit2008/comments_2008_asm/csv-{}.csv'
comments_list = [pd.read_csv(comments_path.format(part)) for part in range(12)]

# Intended column types. NOTE: this mapping is not applied anywhere in this cell.
comments_dtypes = dict(
    id=object,
    author=object,
    link_id=object,
    parent_id=object,
    created_utc=int,
    subreddit=object,
    subreddit_id=object,
    score=int,
    distinguished=object,
    gilded=int,
    controversiality=int,
)

# Stack all parts into one frame with a fresh 0..n-1 index.
comments_data = pd.concat(comments_list, ignore_index=True)
comments_list = []  # drop the references to the per-file frames

print(comments_data.shape)

In [None]:
# The submission dump is split across twelve files, csv-0.csv through csv-11.csv.
submissions_path = 'ASM_PZ2_podaci_2122/reddit2008/submissions_2008_asm/csv-{}.csv'
submissions_list = [pd.read_csv(submissions_path.format(part)) for part in range(12)]

# Intended column types. NOTE: this mapping is not applied anywhere in this cell.
submissions_dtypes = dict(
    id=object,
    url=object,
    permalink=object,
    author=object,
    created_utc=int,
    subreddit=object,
    subreddit_id=object,
    num_comments=int,
    score=int,
    over_18=bool,
    distinguished=object,
    domain=object,
    stickied=bool,
    locked=bool,
    hide_score=bool,
)

# Stack all parts into one frame with a fresh 0..n-1 index.
submissions_data = pd.concat(submissions_list, ignore_index=True)
submissions_list = []  # drop the references to the per-file frames

print(submissions_data.shape)

# Data Cleansing

In [None]:
# Sanity-check the id columns of both tables: any missing values, any duplicates.
comments_null_mask = comments_data['id'].isnull()
print(comments_null_mask.values.any())
print(comments_data['id'].is_unique)

print(submissions_data['id'].isnull().values.any())
print(submissions_data['id'].is_unique)

comments_null_id = comments_data[comments_null_mask]
print("\n", comments_null_id)

# Show the first null-id row together with its direct neighbours. The window is
# derived from the data instead of a hard-coded row number, so the cell keeps
# working if the input files change.
null_positions = np.flatnonzero(comments_null_mask.to_numpy())
if null_positions.size:
    first_null = int(null_positions[0])
    context_rows = slice(max(first_null - 1, 0), first_null + 2)
    print("\n", comments_data.iloc[context_rows, :])

# Replace the missing id with the literal string "nan".
# NOTE(review): presumably the real id was the text "nan", which read_csv parsed
# as a missing value - confirm against the raw file.
comments_data.loc[comments_null_mask, 'id'] = "nan"

if null_positions.size:
    print("\nAfter id null fix:\n", comments_data.iloc[context_rows, :])

In [None]:
# Both frames carry an "Unnamed: 0" column (presumably a row index saved into the
# CSV files - TODO confirm); show it and drop it.
unnamed_col = "Unnamed: 0"

# --- comments ---
print(comments_data.columns)
print(comments_data[unnamed_col])

comments_data = comments_data.drop(unnamed_col, axis=1)

print("\n", comments_data.columns)

# --- submissions ---
print("\n", submissions_data.columns)
print(submissions_data[unnamed_col])

submissions_data = submissions_data.drop(unnamed_col, axis=1)

print("\n", submissions_data.columns)

In [None]:
# Comments whose controversiality flag is anything other than 0.
is_controversial = comments_data['controversiality'].ne(0)
comments_contr_not_zero = comments_data[is_controversial]
print("\n", comments_contr_not_zero)

In [None]:
# List the COMMENTS columns that contain at least one missing value.
nan_values = comments_data.isnull()
nan_columns = nan_values.any(axis=0)
columns_with_nan = list(comments_data.columns[nan_columns])
print(columns_with_nan)

In [None]:
# Comments that have a value in the `distinguished` column.
has_distinguished = comments_data['distinguished'].notnull()
comments_dis_not_null = comments_data[has_distinguished]
print("\n", comments_dis_not_null.shape)

In [None]:
# List the SUBMISSIONS columns that contain at least one missing value.
nan_values = submissions_data.isnull()
nan_columns = nan_values.any(axis=0)
columns_with_nan = list(submissions_data.columns[nan_columns])
print(columns_with_nan)

In [None]:
# Submissions that have a value in the `distinguished` column.
has_distinguished = submissions_data['distinguished'].notnull()
submissions_dis_not_null = submissions_data[has_distinguished]
print("\n", submissions_dis_not_null.shape)

# Submissions whose `domain` is missing (despite its name, this variable holds
# the rows where `domain` IS null).
domain_missing = submissions_data['domain'].isnull()
submissions_domain_not_null = submissions_data[domain_missing]
print("\n", submissions_domain_not_null.shape)

In [None]:
# Show the column dtypes pandas inferred for both tables.
comments_column_types = comments_data.dtypes
submissions_column_types = submissions_data.dtypes
print(comments_column_types, "\n")
print(submissions_column_types)

# Statistička obrada podataka (3.4.1.)

In [None]:

# 1) How many distinct subreddits appear in the observed period?
#    Which are the most important by number of users, and which by number of comments?

# Select the two columns by name. A label-range slice ('subreddit':'subreddit_id')
# only works while the columns happen to be adjacent and in that order.
subreddit_key_columns = ['subreddit', 'subreddit_id']

submissions_subreddit_columns = submissions_data[subreddit_key_columns].drop_duplicates()
print("SUBMISSIONS Subreddits:\n", submissions_subreddit_columns.shape, "\n")

comments_subreddit_columns = comments_data[subreddit_key_columns].drop_duplicates()
print("COMMENTS Subreddits:\n", comments_subreddit_columns.shape, "\n")

subreddit_columns = pd.concat([submissions_subreddit_columns, comments_subreddit_columns])
print("ALL Subreddits:\n", subreddit_columns.shape, "\n")

subreddit_columns = subreddit_columns.drop_duplicates()
print("UNIQUE Subreddit pairs:\n", subreddit_columns.shape, "\n")

#---------------------------------------------------------------------------------------------

# Check whether names and ids map one-to-one (subreddits, subreddit IDs).
print("\nCHECK FOR NONUNIQUENESS\n")
print("submissions_data subreddit_id - is unique: ", submissions_subreddit_columns['subreddit_id'].is_unique)
print("comments_data subreddit_id - is unique: ", comments_subreddit_columns['subreddit_id'].is_unique)
print("Column subreddit - is unique: ", subreddit_columns['subreddit'].is_unique)
print("Column subreddit_id - is unique: ", subreddit_columns['subreddit_id'].is_unique, "\n")

data_grouped = subreddit_columns[['subreddit', 'subreddit_id']].groupby('subreddit_id')

# Number of distinct (name, id) pairs per id. `.size()` replaces `agg(np.size)`,
# whose callable-alias behaviour is deprecated in recent pandas.
data_aggregated = data_grouped.size()

# Ids that occur with more than one subreddit name.
data_nonunique = data_aggregated[data_aggregated > 1]
print(data_nonunique, "\n")

target_match_list = data_nonunique.index

data_target = subreddit_columns[subreddit_columns['subreddit_id'].isin(target_match_list)]
print(data_target, "\n")

# One example row per ambiguous id from each table.
print("COMMENTS data rows with target_match IDs:\n")
comments_target_rows = comments_data[comments_data['subreddit_id'].isin(target_match_list)]
comments_target_rows = comments_target_rows.drop_duplicates('subreddit_id')
print(comments_target_rows, "\n")

print("SUBMISSIONS data rows with target_match IDs:\n")
submissions_target_rows = submissions_data[submissions_data['subreddit_id'].isin(target_match_list)]
submissions_target_rows = submissions_target_rows.drop_duplicates('subreddit_id')
print(submissions_target_rows)


In [None]:
print("Before:\n", subreddit_columns, "\n")

# Drop the two underscore-prefixed (name, id) pairs, one at a time.
is_descary_alias = subreddit_columns['subreddit'].eq('_Descary') & subreddit_columns['subreddit_id'].eq('t5_2qj0s')
unique_subreddits_1 = subreddit_columns[~is_descary_alias]

is_journalism_alias = unique_subreddits_1['subreddit'].eq('__Journalism') & unique_subreddits_1['subreddit_id'].eq('t5_2qhyl')
unique_subreddits = unique_subreddits_1[~is_journalism_alias]
print("After:\n", unique_subreddits)

# Q: How many distinct subreddits appear in the observed period?
# ANSWER: 5032

# Rename the subreddits behind those ids from _Name to Name in submissions_data.
# TODO: the permalink column should be updated as well.
# NOTE(review): comments_data is not renamed here - confirm that is intended.
for sub_id, canonical_name in (('t5_2qj0s', 'Descary'), ('t5_2qhyl', 'Journalism')):
    submissions_data.loc[submissions_data['subreddit_id'] == sub_id, 'subreddit'] = canonical_name

# Show one row per affected id to verify the rename.
submissions_rows = submissions_data[submissions_data['subreddit_id'].isin(target_match_list)]
submissions_rows = submissions_rows.drop_duplicates('subreddit_id')
print(submissions_rows)

In [None]:
print("SUBREDDIT WITH MAX NUMBER OF COMMENTS:\n")
# Sum the per-submission `num_comments` field within every subreddit.
submissions_num_comments = submissions_data.groupby(["subreddit","subreddit_id"]).num_comments.sum().reset_index()
print("Subreddits num_comments:\n", submissions_num_comments, "\n")

# idxmax() returns an index LABEL, so the row is looked up with .loc rather than
# the positional .iloc.
max_num_comments = submissions_num_comments['num_comments'].idxmax()
max_num_comments_row = submissions_num_comments.loc[max_num_comments]
print("Subreddits with MAX num of comments:\n", max_num_comments_row, "\n")

max_num_comments_10_rows = submissions_num_comments.nlargest(10, 'num_comments')
print("First 10 subreddits with MAX num of comments:\n", max_num_comments_10_rows, "\n")

# Q: Koji su najvažniji po broju korisnika, a koji po broju komentara?
# A: Po broju komentara: reddit.com (1768764), politics(1059618), programming(421137), ...


In [None]:
print("SUBREDDIT WITH MAX NUMBER OF USERS:\n")
submissions_authors = submissions_data[['subreddit', 'subreddit_id', 'author']]
print("Subreddits authors:\n", submissions_authors, "\n")

# One row per (subreddit, author) pair.
submissions_authors_unique = submissions_authors.drop_duplicates()
print("Subreddits authors without duplicates:\n", submissions_authors_unique, "\n")

# Number of distinct submitting authors per subreddit. The '[deleted]'
# placeholder is still counted as one author here.
submissions_num_authors = submissions_authors_unique.groupby(["subreddit","subreddit_id"]).author.count().reset_index()
print("Subreddit num_authors:\n", submissions_num_authors, "\n")

# idxmax() returns an index LABEL, so the row is looked up with .loc rather than
# the positional .iloc.
max_num_authors = submissions_num_authors['author'].idxmax()
max_num_authors_row = submissions_num_authors.loc[max_num_authors]
print("Subreddits with MAX num of authors:\n", max_num_authors_row, "\n")

# The printed label promises ten rows, so ten rows are selected.
max_num_authors_10_rows = submissions_num_authors.nlargest(10, 'author')
print("First 10 subreddits with MAX num of authors:\n", max_num_authors_10_rows, "\n")

# sta raditi sa celijama gde je author==[deleted] ??

# Q: Koji  su najvažniji po broju korisnika, a koji po broju komentara?
# A: Po broju korisnika: reddit.com (138153), politics(15250), business(13009), ...

In [None]:
# 2) Kakav je prosečan broj zabeleženih korisnika aktivnih u posmatranom periodu po sabreditu? 
# Korisnik se smatra aktivnim na sabreditu ako je zabeležen barem jedan komentar ili objava tog korisnika.

author_key_columns = ['subreddit', 'subreddit_id', 'author']

print("AUTHORS PER SUBMISSION:\n")
submissions_authors = submissions_data.loc[:, author_key_columns]
print("Submissions authors:\n", submissions_authors, "\n")

submissions_authors_unique = submissions_authors.drop_duplicates()
print("Submissions authors without duplicates:\n", submissions_authors_unique, "\n")

print("AUTHORS PER COMMENT:\n")
comments_authors = comments_data.loc[:, author_key_columns]
print("Comments authors:\n", comments_authors, "\n")

comments_authors_unique = comments_authors.drop_duplicates()
print("Comments authors without duplicates:\n", comments_authors_unique, "\n")

# A user is active on a subreddit if they commented OR posted there, so both
# sources are combined and de-duplicated once more.
print("AUTHORS PER COMMENT AND PER SUBMISSION:")
subreddit_authors = pd.concat([comments_authors_unique, submissions_authors_unique])
print(subreddit_authors.shape, "\n")

subreddit_authors_unique = subreddit_authors.drop_duplicates()
print("Subreddit authors without duplicates:\n", subreddit_authors_unique, "\n")

# Distinct active authors per subreddit.
subreddit_num_authors = subreddit_authors_unique.groupby(["subreddit", "subreddit_id"])['author'].count().reset_index()
print("Subreddit num_authors:\n", subreddit_num_authors, "\n")

# Mean author count over all subreddits, rounded to an integer.
subreddit_sum_authors = subreddit_num_authors['author'].sum()
subreddit_authors_num_rows = len(subreddit_num_authors)
subreddit_authors_avg = round(subreddit_sum_authors / subreddit_authors_num_rows)
print("Average number of authors per subreddit:\n", subreddit_authors_avg)

# Q: Kakav je prosečan broj zabeleženih korisnika aktivnih u posmatranom periodu po sabreditu?
# A: 129

In [None]:
# 3) Which users have the most submissions, and which have the most comments?

print("MAX SUBMISSIONS USERS:\n")
submission_counts = submissions_data.author.value_counts()
max_submissions_users = submission_counts[:6]
print("With deleted:\n", max_submissions_users)
# Remove the '[deleted]' placeholder by name instead of assuming it is the first
# (most frequent) entry; then keep the five most active real users.
max_submissions_users_not_del = submission_counts.drop('[deleted]', errors='ignore').head(5)
print("\nUsers with max_submissions:\n", max_submissions_users_not_del)

# Q: Which users have the most submissions, and which have the most comments?
# A: Users with the most submissions: gst(18870), qgyh2(12238), ...

print("\nMAX COMMENTS USERS:\n")
comment_counts = comments_data.author.value_counts()
max_comments_users = comment_counts[:6]
print("With deleted:\n", max_comments_users)
max_comments_users_not_del = comment_counts.drop('[deleted]', errors='ignore').head(5)
print("\nUsers with max_comments:\n", max_comments_users_not_del)

# Q: Which users have the most submissions, and which have the most comments?
# A: Users with the most comments: NoMoreNicksLeft(13480), malcontent(12159), ...

In [None]:
# 4) Which users are active on the largest number of subreddits? On how many?

print("SUBMISSION AUTHORS PER SUBREDDIT:\n")
submissions_authors_4 = submissions_data[['subreddit', 'subreddit_id', 'author']]
print("Submissions authors:\n", submissions_authors_4.shape, "\n")

submissions_authors_unique_4 = submissions_authors_4.drop_duplicates()
print("Submissions authors without duplicates:\n", submissions_authors_unique_4, "\n")

print("COMMENT AUTHORS PER SUBREDDIT:\n")
comments_authors_4 = comments_data[['subreddit', 'subreddit_id', 'author']]
print("Comments authors:\n", comments_authors_4.shape, "\n")

comments_authors_unique_4 = comments_authors_4.drop_duplicates()
print("Comments authors without duplicates:\n", comments_authors_unique_4, "\n")

# De-duplicate AFTER combining: a user who both posted and commented in the same
# subreddit appears in both inputs and would otherwise be counted twice.
print("COMMENT AND SUBMISSION AUTHORS PER SUBREDDIT:")
subreddit_authors_4 = pd.concat([comments_authors_unique_4, submissions_authors_unique_4]).drop_duplicates()
print(subreddit_authors_4.shape, "\n")

print("MAX SUBREDDIT USERS:\n")
# Each remaining row is one (subreddit, author) pair, so the number of rows per
# author is the number of subreddits that author is active on.
subreddit_counts_4 = subreddit_authors_4.author.value_counts()
max_subreddit_users_4 = subreddit_counts_4.head(6)
print("With deleted:\n", max_subreddit_users_4, "\n")
# Remove the '[deleted]' placeholder by name instead of assuming it ranks first.
max_subreddit_users_not_del_4 = subreddit_counts_4.drop('[deleted]', errors='ignore').head(5)
print("Users active on max_subreddits:\n", max_subreddit_users_not_del_4)

# Q: Which users are active on the largest number of subreddits? On how many?
# A (recorded before the de-duplication fix above; re-run to refresh):
#    MrKlaatu(181), Escafane(154), ...

In [None]:
# 5) Kako su korelisani brojevi objava i brojevi komentara korisnika? Odrediti Pirsonov koeficijent korelacije i izvršiti vizuelizaciju.
import scipy.stats as sc
from matplotlib import pyplot as plt

# Per-author submission counts, with the '[deleted]' placeholder removed.
print("MAX SUBMISSIONS USERS:\n")
max_submissions_users = submissions_data.author.value_counts().reset_index()
max_submissions_users.columns = ['author', 'sub_count']
max_submissions_users = max_submissions_users[max_submissions_users['author'] != '[deleted]']
# The label now matches the data: '[deleted]' has already been filtered out.
print("Without deleted:\n", max_submissions_users)

# Per-author comment counts, with the '[deleted]' placeholder removed.
print("\nMAX COMMENTS USERS:\n")
max_comments_users = comments_data.author.value_counts().reset_index()
max_comments_users.columns = ['author', 'com_count']
max_comments_users = max_comments_users[max_comments_users['author'] != '[deleted]']
print("Without deleted:\n", max_comments_users)

# Outer merge keeps authors who only posted or only commented; the missing
# count on the other side becomes 0.
corell = pd.merge(max_submissions_users, max_comments_users, how="outer", on="author")
corell = corell.fillna(0)

# fillna turned the count columns into floats; restore integer counts.
corell_dtypes = {
    "sub_count": int,
    "author": object,
    "com_count": int
}
corell = corell.astype(corell_dtypes)
print(corell)

x = corell['sub_count'].to_list()
y = corell['com_count'].to_list()

# One point per author: submissions on x, comments on y.
plt.plot(x, y, 'o')
plt.xlabel("submissions")
plt.ylabel("comments")
plt.show()

# pearsonr returns (coefficient, p-value); only the coefficient is reported.
p_coeff = sc.pearsonr(x, y)
print('Pearson correlation coefficient:', p_coeff[0])

In [None]:
# 6) Which submissions have the most comments, and on which subreddits were they posted?
# Show the data for those submissions, including the subreddit they were posted on
# and their content (only if the submission's "over 18" field is false).

submissions_sorted_num_comments = submissions_data.sort_values(by='num_comments', ascending=False)

submissions_sorted_num_comments_first10 = submissions_sorted_num_comments[:10]
print("First 10 submissions sorted by number of comments:\n", submissions_sorted_num_comments_first10, "\n")

# Q: Which submissions have the most comments, and on which subreddits were they posted?
# A: submission_id(subreddit, num_comments) = 6nz1k(science, 33329), 78n1v(WTF, 3657), ...

# Content (the url) is listed only for submissions not flagged over_18, as the
# task requires. An explicit comparison is used because the column dtype is not
# enforced when the data is loaded.
not_over_18 = submissions_sorted_num_comments_first10['over_18'] == False
urls = submissions_sorted_num_comments_first10.loc[not_over_18, 'url'].to_list()
print(urls)

### Modelovanje mreže

In [None]:
import networkx as nx

In [None]:
# Undirected graph with one node per distinct subreddit id.
subreddit_nodes = set(unique_subreddits['subreddit_id'])
G = nx.Graph()
G.add_nodes_from(subreddit_nodes)

In [None]:
pair_columns = ['subreddit_id', 'author']
sub_subreddit_id_author = submissions_data.loc[:, pair_columns]
com_subreddit_id_author = comments_data.loc[:, pair_columns]

print("Listing subreddits and authors:\n")
print(sub_subreddit_id_author, '\n')
print(com_subreddit_id_author, '\n')

# Rows attributed to the '[deleted]' placeholder are discarded.
sub_subreddit_id_author = sub_subreddit_id_author.loc[sub_subreddit_id_author['author'].ne('[deleted]')]
com_subreddit_id_author = com_subreddit_id_author.loc[com_subreddit_id_author['author'].ne('[deleted]')]

print("After removed '[deleted]':\n")
print(sub_subreddit_id_author, '\n')
print(com_subreddit_id_author, '\n')

# Keep one row per (subreddit, author) within each source.
sub_subreddit_id_author = sub_subreddit_id_author.drop_duplicates()
com_subreddit_id_author = com_subreddit_id_author.drop_duplicates()

print("After dropped duplicates:\n")
print(sub_subreddit_id_author, '\n')
print(com_subreddit_id_author, '\n')

# Merge both sources; a pair present in both is kept once.
subreddit_id_author = (
    pd.concat([sub_subreddit_id_author, com_subreddit_id_author])
    .drop_duplicates()
    .reset_index(drop=True)
)

print("After concationation and dropped duplicates:\n")
print(subreddit_id_author, '\n')

In [None]:
grouped_by_author = subreddit_id_author.groupby('author')

# For every author, connect each unordered pair of subreddits the author is
# active on; the edge weight counts how many authors the two subreddits share.
for _, g in grouped_by_author:
    sub_list = g['subreddit_id'].tolist()

    for pos, first_sub in enumerate(sub_list):
        for second_sub in sub_list[pos + 1:]:
            if G.has_edge(first_sub, second_sub):
                G[first_sub][second_sub]['weight'] += 1
            else:
                G.add_edge(first_sub, second_sub, weight=1)
                    

In [None]:
import os

output_path = "models/our_model.gml"

# Create the target directory if needed so the cell also works on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

nx.write_gml(G, output_path)

# Osnovna karakterizacija modelovanih mreža (3.4.2.)

In [None]:
# Reload the graph from the GML file; from here on `G` is the graph read from disk.
input_path = "models/our_model.gml"
G = nx.read_gml(input_path)

In [None]:
# nx.info() was removed in NetworkX 3.0, so the summary is assembled explicitly.
print("Graph: ", f"{type(G).__name__} with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")
print("Graph isolates: ", nx.number_of_isolates(G))

In [None]:
def get_largest_component(gra):
    """Return an independent copy of the largest connected component of ``gra``.

    The component with the most nodes is selected, its node and edge counts are
    printed, and the induced subgraph is returned as a copy.
    """
    biggest_node_set = max(nx.connected_components(gra), key=len)
    gra_dom = gra.subgraph(biggest_node_set).copy()
    node_total = len(gra_dom.nodes())
    edge_total = len(gra_dom.edges())
    print(f"Dominantna komponenta ima {node_total} čvorova i {edge_total} grana")
    return gra_dom

#### Network Density

In [None]:

### 7) What is the density of the network?

# (Gephi) Graph Density: 0.012
graph_density = nx.density(G)
print("Graph Density: ", graph_density)

#### Network Shortest Paths

In [None]:

### 8) Kolike su prosečne distance u okviru mreže i dijametar mreže?

# (Gephi) Average Path length: 2.098559911126496
# (Gephi) Network Diameter: 5 

# Both metrics raise NetworkXError on a disconnected graph, so they are computed
# on the largest connected component only.
Gdom = get_largest_component(G)

dominant_diameter = nx.diameter(Gdom)
print(dominant_diameter)

dominant_avg_path_length = nx.average_shortest_path_length(Gdom)
print(dominant_avg_path_length)

In [None]:

### 9) U kojoj meri je mreža povezana i centralizovana? 
# Navesti broj i veličine povezanih komponenata i proceniti da li postoji gigantska komponenta. 

# Broj povezanih komponenata

# (Gephi) Number of Weakly Connected Components: 1486
# Sizes (node counts) of all connected components, computed once.
component_sizes = [len(component) for component in nx.connected_components(G)]

print("Graph - is_connected: ", nx.is_connected(G))
print("Number of connected components: ", len(component_sizes))

# Sizes of the connected components

print("Largest connected component: ", max(component_sizes))
print("Sorted list of connected components, largest first:")
com_list = sorted(component_sizes, reverse=True)
print(com_list)
# Postoji gigantska komponenta.

#### Network Clustering

In [None]:
### 10) Koliki je prosečni, a koliki globalni koeficijent klasterizacije mreže? 
# Kakva je raspodela lokalnog koeficijenta klasterizacije njenih čvorova? 
# Da li je klasterisanje izraženo ili ne? Odgovor dati upoređivanjem sa slučajno generisanom Erdos-Renyi mrežom istih dimenzija. 

In [None]:
# Global clustering coefficient

# The global clustering coefficient is based on triplets of nodes.
# A triplet is three nodes that are connected by either two (open triplet) or three (closed triplet) undirected ties.
# The global clustering coefficient is the number of closed triplets (or 3 x triangles) over the total number of triplets (both open and closed). 
# global_cc = (number of closed triplets) / (number of all triplets)

# Compute graph transitivity, the fraction of all possible triangles present in G.
# Possible triangles are identified by the number of "triads" (two edges with a shared vertex).
# The transitivity is:  T = 3*(triangles/triads)

# Transitivity of the whole graph, reported as the global clustering coefficient.
tr = nx.transitivity(G)
print("Global clustering coefficient: {}".format(tr))

In [None]:
# Average clustering coefficient

# (Gephi) Average Clustering Coefficient: 0.907
# (Gephi) Dominant Component Average Clustering Coefficient: 0.907

# Mean over all nodes, including nodes whose coefficient is 0.
print("Average clustering coefficient: ", nx.average_clustering(G))
print("Average clustering coefficient dominant: ", nx.average_clustering(Gdom))

# Mean over nodes with a non-zero coefficient only. These lines get their own
# labels so they cannot be confused with the values printed above.
print("Average clustering coefficient (zeros excluded): ", nx.average_clustering(G, count_zeros = False))
print("Average clustering coefficient dominant (zeros excluded): ", nx.average_clustering(Gdom, count_zeros = False))

In [None]:
# Local clustering coefficient

# Weighted local clustering coefficient of every node, split into two parallel tuples.
subreddit_id, clustering_coef = zip(*nx.clustering(G, weight = "weight").items())

# Keep only the nodes with a strictly positive coefficient.
notZero = [pair for pair in zip(subreddit_id, clustering_coef) if pair[1] > 0]

# Tabulate them in ascending order of coefficient.
df = pd.DataFrame(notZero, columns = ["id", "cc"])
df = df.sort_values('cc')

print("Max local clustering coefficient: ", max(clustering_coef))
print("Local clustering coefficients != 0 :")
print(df)

# raspodela lokalnog koeficijenta klasterizacije ???



In [None]:
# Plot the non-zero local clustering coefficients from largest to smallest.
df = df.sort_values('cc', ascending=False)
y = df['cc'].to_list()

plt.xlabel("Број чворова")
plt.ylabel("Коефицијент кластеризације")
plt.plot(y)
plt.show()

###### Erdos-Renyi

In [None]:
# Erdos-Renyi reference graph with the same node count as G and an edge
# probability chosen so that the expected edge count equals G's.
n = G.number_of_nodes()
m = G.number_of_edges()
p = ( 2*float(m) ) / ( n* (n-1) )

# A fixed seed makes the random graph, and every number derived from it,
# reproducible across kernel restarts.
er_network = nx.erdos_renyi_graph(n, p, seed=42)

In [None]:
# Da li je klasterisanje izraženo ili ne?

# Clustering of the random reference graph, for comparison with G.
er_average_cc = nx.average_clustering(er_network, count_zeros = False)
print("Erdos-Renyi Average cc", er_average_cc)
er_global_cc = nx.transitivity(er_network)
print("Erdos-Renyi Global cc", er_global_cc)

# A: Izrazenije nego u er mrezi: Da

#### Small World Network

In [None]:
### 11) Na osnovu odgovora na pitanja 8 i 10, proceniti da li mreža iskazuje osobine malog sveta.

# A small-world network is a type of mathematical graph in which most nodes are not neighbors of one another, but 
# the neighbors of any given node are likely to be neighbors of each other and most nodes can be reached 
# from every other node by a small number of hops or steps.

# A small world network is characterized by a small average shortest path length, and a large clustering coefficient
# Small-worldness is commonly measured with the coefficient sigma or omega.
# Both coefficients compare the average clustering coefficient and shortest path length of a given graph 
# against the same quantities for an equivalent random or lattice graph. (watts_strogatz_graph)

# if sigma > 1 network is small-world. 
#nx.sigma(G) # forever

# Erdos-Renyi network:
# nx.average_shortest_path_length(er_network) # result: 2.444373264140706
# Erdos-Renyi Average cc 0.012376659911008338

# ANSWER:
# (Gephi) Average Path length: 2.098559911126496
# Average clustering coefficient:  0.907

# Da. (Visok koef. klasterizacije i mala prosecna distanca)

#### Network Assortativity

In [None]:
### 12) Izvršiti asortativnu analizu po stepenu čvora i dati odgovor da li je izraženo asortativno mešanje. 
# U slučaju da je mreža usmerena, analizu izvršiti i po ulaznom i po izlaznom stepenu čvora. 
# Priložiti i vizuelizaciju. 

## assortativity based on the unweighted node degree
unweighted_assortativity = nx.degree_assortativity_coefficient(G)
print("Degree assortativity coefficient: ", unweighted_assortativity)
## assortativity based on the weighted node degree
weighted_assortativity = nx.degree_assortativity_coefficient(G, weight='weight')
print("Degree assortativity coefficient (weight): ", weighted_assortativity)

# Assortativity measures the similarity of connections in the graph with respect to the node degree.
## Asortativna mreža => čvorovi sličnog stepena se vezuju međusobno

# Positive values of r indicate a correlation between nodes of similar degree, 
# while negative values indicate relationships between nodes of different degree. 
# In general, r lies between −1 and 1. 
# When r = 1, the network is said to have perfect assortative mixing patterns,
# when r = 0 the network is non-assortative,
# while at r = −1 the network is completely disassortative.

# ANSWER:
# Mreža je disasortativna.

#### Rich Club Phenomenon

In [None]:
### 13) Da li mreža ispoljava fenomen kluba bogatih (eng. rich club phenomenon)? 

# Rich-club can be viewed as a more specific notation of assortativity, where we are only concerned with the connectivity of nodes beyond a certain richness metric.
# The rich-club coefficient is a metric on graphs and networks, designed to measure the extent to which well-connected nodes also connect to each other. 

#nx.rich_club_coefficient(G) # forever
# Non-normalised rich-club coefficient; the result is a dict keyed by degree.
rcc = nx.rich_club_coefficient(G, normalized=False)
print(rcc)

# Plot coefficient against degree. The dict views are converted to lists
# because plt.plot does not reliably accept a dict_values object, and the keys
# supply the x axis that was previously missing.
plt.plot(list(rcc.keys()), list(rcc.values()))
plt.xlabel("degree k")
plt.ylabel("rich-club coefficient")
plt.show()

rcc = None  # release the dictionary
# ANSWER: Ne

#### Distribution Degree 

In [None]:
### 14) Kakva je distribucija čvorova po stepenu i da li prati power law raspodelu?

import numpy as np
import matplotlib.pyplot as plt

from collections import Counter
from operator import itemgetter

def plot_deg_frequency(G, weighted = False, xscale = "log", yscale = "log"):
    """Scatter-plot, for each degree value in ``G``, how many nodes have it.

    Parameters:
        G: graph whose node degrees are counted.
        weighted: if true, use the degree weighted by the "weight" edge
            attribute; otherwise the plain degree.
        xscale, yscale: matplotlib axis scale names for the two axes.

    Both axes start at 1 and end at the largest observed value.
    """
    node_degrees = G.degree(weight="weight") if weighted else G.degree()

    # degree value -> number of nodes with that degree
    deg_counts = Counter(node_degree for _, node_degree in node_degrees)
    x, y = zip(*deg_counts.items())

    plt.figure(1)

    # x axis: degree values
    plt.xlabel('weighted degree' if weighted else 'degree')
    plt.xscale(xscale)
    plt.xlim(1, max(x))

    # y axis: node counts
    plt.ylabel('frequency')
    plt.yscale(yscale)
    plt.ylim(1, max(y))

    plt.scatter(x, y, marker='.')
    plt.show()
    

# First the plain degree distribution, then the weighted degree distribution.
for use_weights in (False, True):
    plot_deg_frequency(G, weighted = use_weights)

In [None]:
# Each graph is drawn twice: on linear axes, then on log-log axes.
print("Erdos-Renyi Graph:")
for axis_scale in ('linear', 'log'):
    plot_deg_frequency(er_network, xscale = axis_scale, yscale = axis_scale)

print("Our Graph G:")
for axis_scale in ('linear', 'log'):
    plot_deg_frequency(G, xscale = axis_scale, yscale = axis_scale)

#### Fitting to the Power-Law Distribution

In [None]:
# Total edge weight of the dominant component. A generator expression keeps the
# loop variables local, so the notebook-level `n` (node count of G) is no longer
# overwritten by an edge tuple.
sum_of_weights = sum(weight for _, _, weight in Gdom.edges.data('weight'))

print(sum_of_weights)

In [None]:
# Degrees of all nodes in the dominant component, largest first.
degree_sequence = sorted([d for n, d in Gdom.degree()], reverse=True)
degreeCount = Counter(degree_sequence)
deg, cnt = zip(*degreeCount.items())

max_deg = max(deg) # request max_deg bins (the intent is one bin per degree value)
num_nodes = len(degree_sequence)

# Histogram over the full degree sequence (one entry per node), not over the
# set of distinct degree values.
values, base = np.histogram(degree_sequence, bins = max_deg)

# cumulative sum (inclusive prefix sum): number of nodes with degree <= x
cumulative = np.cumsum(values)

# Complementary cumulative degree distribution P(X > x). It is normalised by the
# number of nodes, because the histogram counts nodes, not edge weights.
plt.plot(base[:-1], (num_nodes - cumulative) / num_nodes, c='blue')
plt.xlabel("degree")
plt.ylabel("P(X > x)")

plt.show()

In [None]:
import powerlaw

In [None]:
# Node degrees are integers, so the fit uses the discrete form of the distributions.
results = powerlaw.Fit(degree_sequence, discrete=True)

print(results.supported_distributions)

In [None]:
# Parameters of the fitted power law.
fitted_power_law = results.power_law
print(fitted_power_law.alpha)
print(fitted_power_law.xmin)
print(fitted_power_law.sigma)

# Pairwise comparisons between candidate distributions; each yields a
# log-likelihood ratio R and a significance value p.
candidate_pairs = (
    ('power_law', 'exponential'),
    ('power_law', 'truncated_power_law'),
    ('exponential', 'truncated_power_law'),
)
for first_dist, second_dist in candidate_pairs:
    R, p = results.distribution_compare(first_dist, second_dist)
    print(f"Loglikelihood ratio: {R}")
    print(f"Statistical significance: {p}")

#### Hubs and authorities

In [None]:
### 15) Odrediti najvažnije habove i autoritete u mreži. 
# Kako su oni raspoređeni i ugrađeni u mrežu, da li su na periferiji ili u jezgru mreže?

# Hubs and authorities are a natural generalization of eigenvector centrality.
# A high hub actor points to many good authorities and 
# a high authority actor receives from many good hubs. 

# Mreza je neusmerena -> nema habove i autoritete.

In [None]:
# Minimal two-node directed graph with one weighted edge in each direction,
# written out as GML.
DiG = nx.DiGraph()
DiG.add_nodes_from((1, 2))

DiG.add_edge(1, 2, weight=2)
DiG.add_edge(2, 1, weight=1)

nx.write_gml(DiG, "models/test.gml")