In [None]:
# Widen the notebook container so wide DataFrame printouts stay readable.
# `display` and `HTML` are imported from the public IPython.display module;
# importing them from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:75% !important; }</style>"))

In [None]:
from collections import Counter
from itertools import combinations

import numpy as np
import pandas as pd

In [None]:
# The comment data is split across twelve CSV parts: csv-0.csv ... csv-11.csv.
comments_path = 'ASM_PZ2_podaci_2122/reddit2008/comments_2008_asm/csv-{}.csv'
comments_list = [pd.read_csv(comments_path.format(part)) for part in range(12)]

# Intended column types for the comments table.
# NOTE(review): this mapping is declared but not applied to comments_data.
comments_dtypes = {
    "id": object,
    "author": object,
    "link_id": object,
    "parent_id": object,
    "created_utc": int,
    "subreddit": object,
    "subreddit_id": object,
    "score": int,
    "distinguished": object,
    "gilded": int,
    "controversiality": int
}

# Stack all parts into one frame with a fresh 0..n-1 index.
comments_data = pd.concat(comments_list, ignore_index=True)
comments_list = []  # drop the references to the per-file frames

print(comments_data.shape)

In [None]:
# The submission data is split across twelve CSV parts: csv-0.csv ... csv-11.csv.
submissions_path = 'ASM_PZ2_podaci_2122/reddit2008/submissions_2008_asm/csv-{}.csv'
submissions_list = [pd.read_csv(submissions_path.format(part)) for part in range(12)]

# Intended column types for the submissions table.
# NOTE(review): this mapping is declared but not applied to submissions_data.
submissions_dtypes = {
    "id": object,
    "url": object,
    "permalink": object,
    "author": object,
    "created_utc": int,
    "subreddit": object,
    "subreddit_id": object,
    "num_comments": int,
    "score": int,
    "over_18": bool,
    "distinguished": object,
    "domain": object,
    "stickied": bool,
    "locked": bool,
    "hide_score": bool
}

# Stack all parts into one frame with a fresh 0..n-1 index.
submissions_data = pd.concat(submissions_list, ignore_index=True)
submissions_list = []  # drop the references to the per-file frames

print(submissions_data.shape)

# Data Cleansing

In [None]:
# Sanity-check the id columns of both tables: any missing values, any duplicates?
print(comments_data['id'].isnull().values.any())
print(comments_data['id'].is_unique)

print(submissions_data['id'].isnull().values.any())
print(submissions_data['id'].is_unique)

# Comment rows whose id is missing.
comments_null_id_mask = comments_data['id'].isnull()
comments_null_id = comments_data[comments_null_id_mask]
print("\n", comments_null_id)

# Show every affected row together with its direct neighbours. The positions
# are derived from the mask rather than hard-coded, so the cell keeps working
# if the input files (and therefore the row numbers) change.
null_id_positions = np.flatnonzero(comments_null_id_mask.to_numpy())
for pos in null_id_positions:
    print("\n", comments_data.iloc[max(pos - 1, 0):pos + 2, :])

# Replace the missing id with the string "nan".
# NOTE(review): presumably the real comment id is the literal text "nan",
# which read_csv parsed as a missing value -- confirm against the raw CSV.
comments_data.loc[comments_null_id_mask, 'id'] = "nan"

for pos in null_id_positions:
    print("\nAfter id null fix:\n", comments_data.iloc[max(pos - 1, 0):pos + 2, :])

In [None]:
# Both tables carry an "Unnamed: 0" column; inspect it and then remove it.
# NOTE(review): presumably a row index written out when the CSVs were saved.

# --- comments ---
print(comments_data.columns)
print(comments_data["Unnamed: 0"])

comments_data = comments_data.drop("Unnamed: 0", axis=1)

print("\n", comments_data.columns)

# --- submissions ---
print("\n", submissions_data.columns)
print(submissions_data["Unnamed: 0"])

submissions_data = submissions_data.drop("Unnamed: 0", axis=1)

print("\n", submissions_data.columns)

In [None]:
# Comments whose controversiality flag is anything other than 0.
comments_contr_not_zero = comments_data.loc[comments_data['controversiality'].ne(0)]
print("\n", comments_contr_not_zero)

In [None]:
# List the comment columns that contain at least one missing value.
nan_values = comments_data.isna()
nan_columns = nan_values.any(axis=0)
columns_with_nan = list(comments_data.columns[nan_columns])
print(columns_with_nan)

In [None]:
# Comments that have a value in the 'distinguished' column.
comments_dis_not_null = comments_data[comments_data['distinguished'].notnull()]
print("\n", comments_dis_not_null.shape)

In [None]:
# List the submission columns that contain at least one missing value.
nan_values = submissions_data.isna()
nan_columns = nan_values.any(axis=0)
columns_with_nan = list(submissions_data.columns[nan_columns])
print(columns_with_nan)

In [None]:
# Submissions that have a value in the 'distinguished' column.
submissions_dis_not_null = submissions_data[submissions_data['distinguished'].notnull()]
print("\n", submissions_dis_not_null.shape)

# Submissions whose 'domain' is missing.
# NOTE: despite its name, this variable holds the rows where domain IS null.
submissions_domain_not_null = submissions_data[submissions_data['domain'].isnull()]
print("\n", submissions_domain_not_null.shape)

In [None]:
# Report the column dtypes of both tables after cleaning.
comments_dtypes_report = comments_data.dtypes
submissions_dtypes_report = submissions_data.dtypes
print(comments_dtypes_report, "\n")
print(submissions_dtypes_report)

# Statistička obrada podataka (3.4.1.)

In [None]:

# 1) How many distinct subreddits appear in the observed period?
#    Which are the most important by number of users, and which by number of comments?

# Select the two key columns by name. A label slice ('subreddit':'subreddit_id')
# depends on the columns being adjacent and in that order, and would silently
# pull in any column that happens to sit between them.
subreddit_key_columns = ['subreddit', 'subreddit_id']

submissions_subreddit_columns = submissions_data[subreddit_key_columns].drop_duplicates()
print("SUBMISSIONS Subreddits:\n", submissions_subreddit_columns.shape, "\n")

comments_subreddit_columns = comments_data[subreddit_key_columns].drop_duplicates()
print("COMMENTS Subreddits:\n", comments_subreddit_columns.shape, "\n")

subreddit_columns = pd.concat([submissions_subreddit_columns, comments_subreddit_columns])
print("ALL Subreddits:\n", subreddit_columns.shape, "\n")

subreddit_columns = subreddit_columns.drop_duplicates()
print("UNIQUE Subreddit pairs:\n", subreddit_columns.shape, "\n")

#---------------------------------------------------------------------------------------------

# Check whether subreddit names and subreddit ids map one-to-one.
print("\nCHECK FOR NONUNIQUENESS\n")
print("submissions_data subreddit_id - is unique: ", submissions_subreddit_columns['subreddit_id'].is_unique)
print("comments_data subreddit_id - is unique: ", comments_subreddit_columns['subreddit_id'].is_unique)
print("Column subreddit - is unique: ", subreddit_columns['subreddit'].is_unique)
print("Column subreddit_id - is unique: ", subreddit_columns['subreddit_id'].is_unique, "\n")

# Number of distinct (name, id) pairs per subreddit id.
data_grouped = subreddit_columns[subreddit_key_columns].groupby('subreddit_id')
data_aggregated = data_grouped['subreddit_id'].size()

# Ids that appear under more than one name.
data_nonunique = data_aggregated[data_aggregated > 1]
print(data_nonunique, "\n")

target_match_list = data_nonunique.index

data_target = subreddit_columns[subreddit_columns['subreddit_id'].isin(target_match_list)]
print(data_target, "\n")

# One example row per ambiguous id, from each table.
print("COMMENTS data rows with target_match IDs:\n")
comments_target_rows = comments_data[comments_data['subreddit_id'].isin(target_match_list)]
comments_target_rows = comments_target_rows.drop_duplicates('subreddit_id')
print(comments_target_rows, "\n")

print("SUBMISSIONS data rows with target_match IDs:\n")
submissions_target_rows = submissions_data[submissions_data['subreddit_id'].isin(target_match_list)]
submissions_target_rows = submissions_target_rows.drop_duplicates('subreddit_id')
print(submissions_target_rows)


In [None]:
print("Before:\n", subreddit_columns, "\n")

# Drop the underscore-prefixed name variants of the two ids that appear under two names.
is_descary_variant = (subreddit_columns['subreddit'] == '_Descary') & (subreddit_columns['subreddit_id'] == 't5_2qj0s')
unique_subreddits_1 = subreddit_columns[~is_descary_variant]

is_journalism_variant = (unique_subreddits_1['subreddit'] == '__Journalism') & (unique_subreddits_1['subreddit_id'] == 't5_2qhyl')
unique_subreddits = unique_subreddits_1[~is_journalism_variant]
print("After:\n", unique_subreddits)

# Q: How many distinct subreddits appear in the observed period?
# ANSWER: 5032

# Rename the subreddits that share an id from _Name to Name.
# TODO: the permalink column should be updated as well.
for duplicated_id, clean_name in (('t5_2qj0s', 'Descary'), ('t5_2qhyl', 'Journalism')):
    submissions_data.loc[submissions_data['subreddit_id'] == duplicated_id, 'subreddit'] = clean_name

# Show one row per renamed id to confirm the new names.
submissions_rows = submissions_data[submissions_data['subreddit_id'].isin(target_match_list)]
submissions_rows = submissions_rows.drop_duplicates('subreddit_id')
print(submissions_rows)

In [None]:
print("SUBREDDIT WITH MAX NUMBER OF COMMENTS:\n")
# Total num_comments over all submissions of each subreddit.
submissions_num_comments = (
    submissions_data
    .groupby(["subreddit", "subreddit_id"], as_index=False)["num_comments"]
    .sum()
)
print("Subreddits num_comments:\n", submissions_num_comments, "\n")

# Row of the subreddit with the largest total (the index is 0..n-1 here,
# so the label returned by idxmax is also the row position).
max_num_comments = submissions_num_comments['num_comments'].idxmax()
max_num_comments_row = submissions_num_comments.loc[max_num_comments]
print("Subreddits with MAX num of comments:\n", max_num_comments_row, "\n")

max_num_comments_10_rows = submissions_num_comments.nlargest(10, 'num_comments')
print("First 10 subreddits with MAX num of comments:\n", max_num_comments_10_rows, "\n")

# Q: Which subreddits are the most important by number of users, and which by number of comments?
# A: By number of comments: reddit.com (1768764), politics(1059618), programming(421137), ...


In [None]:
print("SUBREDDIT WITH MAX NUMBER OF USERS:\n")
submissions_authors = submissions_data[['subreddit', 'subreddit_id', 'author']]
print("Subreddits authors:\n", submissions_authors, "\n")

# One row per (subreddit, author) so that each author is counted once per subreddit.
submissions_authors_unique = submissions_authors.drop_duplicates()
print("Subreddits authors without duplicates:\n", submissions_authors_unique, "\n")

submissions_num_authors = submissions_authors_unique.groupby(["subreddit","subreddit_id"]).author.count().reset_index()
print("Subreddit num_authors:\n", submissions_num_authors, "\n")

# idxmax returns an index label, so look the row up by label.
max_num_authors = submissions_num_authors['author'].idxmax()
max_num_authors_row = submissions_num_authors.loc[max_num_authors]
print("Subreddits with MAX num of authors:\n", max_num_authors_row, "\n")

# Take ten rows so the table matches its "First 10" caption (it used to take five).
max_num_authors_10_rows = submissions_num_authors.nlargest(10, 'author')
print("First 10 subreddits with MAX num of authors:\n", max_num_authors_10_rows, "\n")

# TODO: decide how to treat rows where author == '[deleted]'
# (they are currently counted as one ordinary user per subreddit).

# Q: Which subreddits are the most important by number of users, and which by number of comments?
# A: By number of users: reddit.com (138153), politics(15250), business(13009), ...

In [None]:
# 2) What is the average number of recorded users active in the observed period, per subreddit?
# A user counts as active on a subreddit if at least one comment or submission by that user is recorded there.

author_key_columns = ['subreddit', 'subreddit_id', 'author']

print("AUTHORS PER SUBMISSION:\n")
submissions_authors = submissions_data[author_key_columns]
print("Submissions authors:\n", submissions_authors, "\n")

submissions_authors_unique = submissions_authors.drop_duplicates()
print("Submissions authors without duplicates:\n", submissions_authors_unique, "\n")

print("AUTHORS PER COMMENT:\n")
comments_authors = comments_data[author_key_columns]
print("Comments authors:\n", comments_authors, "\n")

comments_authors_unique = comments_authors.drop_duplicates()
print("Comments authors without duplicates:\n", comments_authors_unique, "\n")

print("AUTHORS PER COMMENT AND PER SUBMISSION:")
subreddit_authors = pd.concat([comments_authors_unique, submissions_authors_unique])
print(subreddit_authors.shape, "\n")

# An author who both commented and submitted on a subreddit must count once.
subreddit_authors_unique = subreddit_authors.drop_duplicates()
print("Subreddit authors without duplicates:\n", subreddit_authors_unique, "\n")

subreddit_num_authors = (
    subreddit_authors_unique
    .groupby(["subreddit", "subreddit_id"])
    .author.count()
    .reset_index()
)
print("Subreddit num_authors:\n", subreddit_num_authors, "\n")

# Mean authors per subreddit = total author count / number of subreddit rows.
subreddit_sum_authors = subreddit_num_authors['author'].sum()
subreddit_authors_num_rows = len(subreddit_num_authors.index)
subreddit_authors_avg = round(subreddit_sum_authors / subreddit_authors_num_rows)
print("Average number of authors per subreddit:\n", subreddit_authors_avg)

# Q: What is the average number of recorded users active in the observed period, per subreddit?
# A: 129

In [None]:
# 3) Who are the users with the most submissions, and who are the users with the most comments?

print("MAX SUBMISSIONS USERS:\n")
submissions_per_author = submissions_data.author.value_counts()
max_submissions_users = submissions_per_author[:6]
print("With deleted:\n", max_submissions_users)
# Remove the '[deleted]' placeholder by label instead of assuming it is ranked first.
max_submissions_users_not_del = submissions_per_author.drop('[deleted]', errors='ignore').head(5)
print("\nUsers with max_submissions:\n", max_submissions_users_not_del)

# Q: Who are the users with the most submissions, and who with the most comments?
# A: Users with the most submissions: gst(18870), qgyh2(12238), ...

print("\nMAX COMMENTS USERS:\n")
comments_per_author = comments_data.author.value_counts()
max_comments_users = comments_per_author[:6]
print("With deleted:\n", max_comments_users)
# Same label-based removal of the '[deleted]' placeholder.
max_comments_users_not_del = comments_per_author.drop('[deleted]', errors='ignore').head(5)
print("\nUsers with max_comments:\n", max_comments_users_not_del)

# Q: Who are the users with the most submissions, and who with the most comments?
# A: Users with the most comments: NoMoreNicksLeft(13480), malcontent(12159), ...

In [None]:
# 4) Which users are active on the largest number of subreddits? On how many subreddits are they active?

print("SUBMISSION AUTHORS PER SUBREDDIT:\n")
submissions_authors_4 = submissions_data[['subreddit', 'subreddit_id', 'author']]
print("Submissions authors:\n", submissions_authors_4.shape, "\n")

submissions_authors_unique_4 = submissions_authors_4.drop_duplicates()
print("Submissions authors without duplicates:\n", submissions_authors_unique_4, "\n")

print("COMMENT AUTHORS PER SUBREDDIT:\n")
comments_authors_4 = comments_data[['subreddit', 'subreddit_id', 'author']]
print("Comments authors:\n", comments_authors_4.shape, "\n")

comments_authors_unique_4 = comments_authors_4.drop_duplicates()
print("Comments authors without duplicates:\n", comments_authors_unique_4, "\n")

print("COMMENT AND SUBMISSION AUTHORS PER SUBREDDIT:")
# De-duplicate again after combining: an author who both commented and
# submitted on the same subreddit appears in both inputs and would otherwise
# be counted as active on that subreddit twice.
subreddit_authors_4 = pd.concat([comments_authors_unique_4, submissions_authors_unique_4]).drop_duplicates()
print(subreddit_authors_4.shape, "\n")

print("MAX SUBREDDIT USERS:\n")
# Each remaining row is one (subreddit, author) pair, so the number of rows
# per author is the number of subreddits that author is active on.
subreddits_per_author_4 = subreddit_authors_4.author.value_counts()
max_subreddit_users_4 = subreddits_per_author_4[:6]
print("With deleted:\n", max_subreddit_users_4, "\n")
# Remove the '[deleted]' placeholder by label instead of assuming it is ranked first.
max_subreddit_users_not_del_4 = subreddits_per_author_4.drop('[deleted]', errors='ignore').head(5)
print("Users active on max_subreddits:\n", max_subreddit_users_not_del_4)

# Q: Which users are active on the largest number of subreddits? On how many subreddits are they active?
# A (recorded before the duplicate pairs were removed -- re-run to refresh): MrKlaatu(181), Escafane(154), ...

In [None]:
# 5) Kako su korelisani brojevi objava i brojevi komentara korisnika? Odrediti Pirsonov koeficijent korelacije i izvršiti vizuelizaciju.

#

In [None]:
# 6) Which submissions have the most comments, and on which subreddits were they posted?
# Show the data of those submissions, including the subreddit they were posted on
# and their content (if the submission's "over 18" field is set to false).

# Order by comment count (largest first), then keep only rows where over_18 is False.
submissions_sorted_num_comments = (
    submissions_data
    .sort_values(by='num_comments', ascending=False)
    .loc[lambda ranked: ranked['over_18'] == False]
)

submissions_sorted_num_comments_first10 = submissions_sorted_num_comments.iloc[:10]
print("First 10 submissions sorted by number of comments:\n", submissions_sorted_num_comments_first10, "\n")

# Q: Which submissions have the most comments, and on which subreddits were they posted?
# A: submission_id(subreddit, num_comments) = 6nz1k(science, 33329), 78n1v(WTF, 3657), ...

### Modelovanje mreže

In [None]:
import networkx as nx

In [None]:
# One node per distinct subreddit id.
# Series.unique() keeps first-appearance order, so the node order (and with it
# the order written to GML and the row order of the adjacency matrix) is the
# same on every run. Iterating a set of strings is not stable across kernel
# restarts because string hashing is randomised per process.
G = nx.Graph()
G.add_nodes_from(unique_subreddits['subreddit_id'].unique())

In [None]:
# Build the distinct (subreddit_id, author) pairs across submissions and comments.
pair_columns = ['subreddit_id', 'author']
sub_subreddit_id_author = submissions_data[pair_columns]
com_subreddit_id_author = comments_data[pair_columns]

print("Listing subreddits and authors:\n")
print(sub_subreddit_id_author, '\n')
print(com_subreddit_id_author, '\n')

# Drop the '[deleted]' placeholder author from both tables.
sub_subreddit_id_author = sub_subreddit_id_author.loc[sub_subreddit_id_author['author'].ne('[deleted]')]
com_subreddit_id_author = com_subreddit_id_author.loc[com_subreddit_id_author['author'].ne('[deleted]')]

print("After removed '[deleted]':\n")
print(sub_subreddit_id_author, '\n')
print(com_subreddit_id_author, '\n')

sub_subreddit_id_author = sub_subreddit_id_author.drop_duplicates()
com_subreddit_id_author = com_subreddit_id_author.drop_duplicates()

print("After dropped duplicates:\n")
print(sub_subreddit_id_author, '\n')
print(com_subreddit_id_author, '\n')

# Combine both sources, de-duplicate across them and renumber the rows.
subreddit_id_author = (
    pd.concat([sub_subreddit_id_author, com_subreddit_id_author])
    .drop_duplicates()
    .reset_index(drop=True)
)

print("After concationation and dropped duplicates:\n")
print(subreddit_id_author, '\n')

In [None]:
grouped_by_author = subreddit_id_author.groupby('author')

# For every unordered pair of subreddits, count the authors active on both.
# subreddit_id_author holds each (subreddit_id, author) pair once, so each
# author contributes at most 1 to any pair.
pair_weights = Counter()
for _, g in grouped_by_author:
    # Sorting gives every pair a canonical (smaller, larger) orientation so
    # (a, b) and (b, a) are counted under the same key.
    sub_list = sorted(g['subreddit_id'])
    pair_weights.update(combinations(sub_list, 2))

# Assign the final weights in one go. The weights are set rather than
# incremented on the existing graph, so re-running this cell leaves G
# unchanged instead of doubling every edge weight.
G.add_weighted_edges_from((u, v, w) for (u, v), w in pair_weights.items())
                    

In [None]:
# Save the subreddit network as GML.
output_path = "models/our_model.gml"
nx.write_gml(G, path=output_path)

# Osnovna karakterizacija modelovanih mreža (3.4.2.)

In [None]:

# nx.info() was removed in NetworkX 3.0; print the basic summary directly.
print(f"Type: {type(G).__name__}")
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")

# Gephi

### 7) Kolika je gustina mreže?

# (Gephi) Graph Density: 0.012

### 8) Kolike su prosečne distance u okviru mreže i dijametar mreže?

# (Gephi) Average Path length: 2.098559911126496
# (Gephi) Network Diameter: 5 


In [None]:

### 9) U kojoj meri je mreža povezana i centralizovana? 
# Navesti broj i veličine povezanih komponenata i proceniti da li postoji gigantska komponenta. 

# (Gephi) Number of Weakly Connected Components: 1486
# (Gephi) Size -> 0                        - ?
# (Gephi) Ne postoji gigantska komponenta. - ?


In [None]:
# Raspodele stepena čvora

import numpy as np
import matplotlib.pyplot as plt

from collections import Counter
from operator import itemgetter

def plot_deg_frequency(G, weighted = False, xscale = "log", yscale = "log"):
    """Scatter-plot how many nodes have each degree value.

    Parameters
    ----------
    G : graph object exposing ``degree()`` / ``degree(weight=...)``
        yielding ``(node, degree)`` pairs.
    weighted : bool
        If True, use the degree computed with the ``"weight"`` edge attribute;
        otherwise the plain degree.
    xscale, yscale : str
        Axis scales passed to matplotlib (default log-log).

    Prints the degree -> frequency counter and shows the figure. Returns None.
    If the graph has no nodes there is nothing to plot and the function
    returns early.
    """
    degrees = G.degree(weight="weight") if weighted else G.degree()

    deg_list = [deg for _, deg in degrees]
    if not deg_list:
        # zip(*...) unpacking and max() below would fail on an empty graph.
        print("Graph has no nodes; nothing to plot.")
        return

    deg_counts = Counter(deg_list)
    print(deg_counts)
    x, y = zip(*deg_counts.items())

    plt.figure(1)

    # x axis: degree values. The lower limit is 1, so degree-0 nodes fall outside the view.
    plt.xlabel('weighted degree' if weighted else 'degree')
    plt.xscale(xscale)
    plt.xlim(1, max(x))

    # y axis: number of nodes having that degree.
    plt.ylabel('frequency')
    plt.yscale(yscale)
    plt.ylim(1, max(y))

    plt.scatter(x, y, marker='.')
    plt.show()

In [None]:
# Degree distribution of the network on log-log axes.
plot_deg_frequency(G, weighted=False, xscale="log", yscale="log")

#### Raspodela težinskog stepena čvora

In [None]:
# Weighted degree distribution of the network on log-log axes.
plot_deg_frequency(G, weighted=True, xscale="log", yscale="log")

In [None]:
### 10) Koliki je prosečni, a koliki globalni koeficijent klasterizacije mreže? 
# Kakva je raspodela lokalnog koeficijenta klasterizacije njenih čvorova? 
# Da li je klasterisanje izraženo ili ne? Odgovor dati upoređivanjem sa slučajno generisanom Erdos-Renyi mrežom istih dimenzija. 

# (Gephi) Average Clustering Coefficient: 0.907
# globalni koeficijent klasterizacije mreže ???
# raspodela lokalnog koeficijenta klasterizacije ???


In [None]:
# Da li je klasterisanje izraženo ili ne?

# Mere centralnosti

In [None]:
import networkx as nx

# Reload the saved network from GML and convert it to a plain Graph.
G = nx.Graph(nx.read_gml(path="models/our_model.gml"))

In [None]:
# Sizes of all connected components, largest first.
component_size_list = sorted((len(component) for component in nx.connected_components(G)), reverse=True)
print(component_size_list)

In [None]:
# Extract the largest connected component as its own graph and save it.
components = list(nx.connected_components(G))
largest_cc = max(components, key=len)
S = [G.subgraph(nodes).copy() for nodes in components]  # every component as a standalone graph
Gdom = G.subgraph(largest_cc).copy()
print(f"Dominantna komponenta ima {Gdom.number_of_nodes()} čvorova i {Gdom.number_of_edges()} grana")
nx.write_gml(Gdom, "models/gdom_model.gml")

In [None]:
def measure_centrality(measure_name, graph):
    """Print the ten nodes with the highest value of one centrality measure.

    Parameters
    ----------
    measure_name : str
        'DC' (degree), 'CC' (closeness), 'BC' (betweenness) or
        'EC' (eigenvector, computed with the 'weight' edge attribute).
    graph : networkx graph
        Graph to analyse.

    Raises
    ------
    ValueError
        If ``measure_name`` is not one of the supported codes.

    Returns None; the top-10 table is printed.
    """
    supported = ('DC', 'CC', 'BC', 'EC')
    if measure_name not in supported:
        # Fail with a clear message instead of an unbound-variable error below.
        raise ValueError(f"Unknown centrality measure {measure_name!r}; expected one of {supported}")

    if measure_name == 'DC':
        cm_dict = nx.degree_centrality(graph)
    elif measure_name == 'CC':
        cm_dict = nx.closeness_centrality(graph)
    elif measure_name == 'BC':
        cm_dict = nx.betweenness_centrality(graph)
    else:
        # 'EC' -- the only measure here that takes edge weights into account.
        cm_dict = nx.eigenvector_centrality(graph, weight='weight')

    # One row per node, one column named after the measure, highest value first.
    df_cm = pd.DataFrame.from_dict(cm_dict, orient='index', columns=[measure_name])
    df_cm = df_cm.sort_values(by=measure_name, ascending=False)

    print(df_cm.head(10))

In [None]:
# Degree, closeness, betweenness and eigenvector centrality on the dominant component.
for measure_code in ('DC', 'CC', 'BC', 'EC'):
    measure_centrality(measure_code, Gdom)

In [None]:
# Weighted degree of every node in the dominant component, largest first.
degrees = sorted(
    Gdom.degree(weight='weight'),
    key=lambda node_and_degree: node_and_degree[1],
    reverse=True,
)
df_degree = pd.DataFrame.from_dict(dict(degrees), orient='index', columns=['degree'])
print(df_degree.head(10))

In [None]:
# Largest eigenvalue of the adjacency matrix; its reciprocal is the upper
# bound for the Katz attenuation factor alpha.
# adjacency_spectrum uses a general eigenvalue solver and returns a
# complex-dtype array. The adjacency matrix of an undirected graph is
# symmetric, so the eigenvalues are real: take the real part before comparing.
lambda_max = max(nx.adjacency_spectrum(Gdom).real)
print(lambda_max)
print(1/lambda_max)

In [None]:
# Katz centrality (weighted, uniform beta = 1.0) for a range of attenuation factors.
alpha_values = [5e-06, 4e-06, 3e-06, 2e-06, 1e-06]

for a in alpha_values:
    # beta, max_iter, tol, nstart and normalized are left at the library
    # defaults (1.0, 1000, 1e-06, None, True).
    KC_dict = nx.katz_centrality(Gdom, alpha=a, weight='weight')
    df_katzc = pd.DataFrame.from_dict(KC_dict, orient='index', columns=['KC'])
    df_katzc = df_katzc.sort_values(by='KC', ascending=False)
    print(df_katzc.head(10))

In [None]:
# Katz centrality with a per-node beta: every node gets 1, 'reddit.com' gets 10.
nodes_list = list(Gdom.nodes)

vip_subreddit = unique_subreddits[unique_subreddits['subreddit'] == 'reddit.com']
vip_index = nodes_list.index(vip_subreddit['subreddit_id'].item())

beta_values = [10 if position == vip_index else 1 for position in range(len(nodes_list))]
beta_dict = dict(zip(nodes_list, beta_values))

for a in alpha_values:
    # max_iter, tol, nstart and normalized are left at the library defaults
    # (1000, 1e-06, None, True).
    KC_dict = nx.katz_centrality(Gdom, alpha=a, beta=beta_dict, weight='weight')
    df_katzc = pd.DataFrame.from_dict(KC_dict, orient='index', columns=['KC'])
    df_katzc = df_katzc.sort_values(by='KC', ascending=False)
    print(df_katzc.head(10))

# Komune

In [None]:
from scipy import linalg
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import SpectralClustering

In [None]:
def plot_scatterplot(x_data, y_data, x_label, y_label, title):
    """Show a scatter plot of ``y_data`` against ``x_data``.

    ``x_label`` and ``y_label`` are drawn as axis labels at font size 15 and
    ``title`` as the figure title. The figure is displayed; nothing is returned.
    """
    _, axes = plt.subplots()
    axes.scatter(x_data, y_data)
    axes.set_xlabel(x_label, fontsize=15)
    axes.set_ylabel(y_label, fontsize=15)
    axes.set_title(title)
    plt.show()

In [None]:
L_dom = nx.laplacian_matrix(Gdom).toarray()

# The Laplacian of an undirected graph is symmetric, so use the symmetric
# solver: eigvalsh returns real eigenvalues already sorted in ascending order.
# The general solver returns a complex-dtype array, which then has to be
# sorted as complex numbers and cast back to float for the table and plots.
eigenvalues = linalg.eigvalsh(L_dom)
enumerator = np.arange(1, len(eigenvalues) + 1)
df_eig = pd.DataFrame(list(zip(enumerator, eigenvalues)))

# Eigenvalue table.
# NOTE(review): the variable is called df_eig30 but holds the first 60 rows -- confirm which is intended.
df_eig30 = df_eig[:60].copy()
df_eig30.columns = ['k', 'lambda_k']
df_eig30 = df_eig30.astype({'k': 'int32', 'lambda_k':'float'})
print(df_eig30)

plot_scatterplot(enumerator, eigenvalues, r'$k$', r'$\lambda_k$', 'Ceo spektar graf laplasijana dominantne komponente')

# Zoom in on the 30 smallest eigenvalues.
df_eig_30 = df_eig[:30]
plot_scatterplot(df_eig_30.iloc[:,0], df_eig_30.iloc[:,1], r'$k$', r'$\lambda_k$', 'Prvih 30 sopstvenih vrednosti graf laplasijana dom. komponente')

In [None]:
# Numbers of clusters to try.
points_of_interest = [2, 3, 5, 8, 12, 14, 17]

# The affinity matrix does not depend on k, so build it once.
adjacency_dom = nx.adjacency_matrix(Gdom)

for k in points_of_interest:

    # A fixed random_state makes the partition reproducible between runs.
    clustering = SpectralClustering(
        n_clusters=k,
        assign_labels="discretize",
        affinity="precomputed",
        random_state=0,
    ).fit(adjacency_dom)

    colors = clustering.labels_
    c_string = [str(c) for c in colors]

    # Copy of the dominant component with the cluster label stored on each node
    # as the 'color' attribute. A dedicated name is used so the main network
    # `G` is not overwritten by this loop.
    G_spectral = nx.Graph()
    for c, label in zip(c_string, Gdom.nodes()):
        G_spectral.add_node(label, color=c)

    for u, v, edge_data in Gdom.edges(data=True):
        G_spectral.add_edge(u, v, weight=edge_data['weight'])

    nx.write_gml(G_spectral, f"models/spectral{k}.gml")

    # Number of nodes assigned to each of the k clusters.
    csizes = np.bincount(colors, minlength=k)

    print(f"Podela na {k}: velicine komponenata su {csizes}")

In [None]:
# Load the saved partition into k clusters.
k = 2
Gcom = nx.Graph(nx.read_gml("models/spectral{}.gml".format(k)))

In [None]:
# com_ids[c] collects the nodes whose 'color' attribute equals c.
com_ids = [[] for _ in range(k)]

for n, prop in Gcom.nodes(data=True):
    community_index = int(prop['color'])
    com_ids[community_index].append(n)

for members in com_ids:
    print("\n")
    print(members)

In [None]:
# Name/id rows of the subreddits that fall into community 1.
in_community_1 = unique_subreddits['subreddit_id'].isin(com_ids[1])
color_subred = unique_subreddits[in_community_1]
print(color_subred)

In [None]:
# Small scratch graph: two nodes, edge 1->2 with weight 2 and edge 2->1
# without a weight, written to GML.
DiG = nx.DiGraph()
DiG.add_nodes_from([1, 2])

# A DiGraph keeps at most one edge per direction, so a single add is enough for 1->2.
DiG.add_edge(1, 2, weight=2)
DiG.add_edge(2, 1)

nx.write_gml(DiG, "models/test.gml")