In [53]:
import os
from google.cloud import bigquery
import pandas as pd
# import matplotlib
# matplotlib.use('Qt5Agg')
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix, coo_matrix

from src.util.preprocess import get_connectivity
from src.util.visualization import visualize_graph

In [54]:
%matplotlib inline
import mpld3
mpld3.enable_notebook()
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = str('../access/apikey.json')


In [55]:
import PyQt6

In [56]:
# BigQuery client built with default arguments.
client = bigquery.Client()

# Selects the record id and V2Persons field of up to 250 GKG rows dated between
# 2023-07-01 and 2023-07-15 whose V2Persons contains "Albanese" and whose
# V2Locations contains "Australia". The adjacent literals below are joined by
# the parser into one single-line SQL string.
QUERY = (
    "SELECT GKGRECORDID, V2Persons "
    "FROM `gdelt-bq.gdeltv2.gkg` "
    "WHERE DATE>20230701000000 and DATE < 20230715000000 "
    "and V2Persons like '%Albanese%' "
    "and V2Locations like '%Australia%' "
    "limit 250;"
)

In [None]:
# Run the query through the BigQuery client created earlier in this notebook
# and materialise the result as a DataFrame. This replaces pd.read_gbq, which
# is deprecated since pandas 2.2, and gives the otherwise unused `client` a job.
data_frame = client.query(QUERY).to_dataframe()

In [None]:
# Preview the first rows of the query result (the query selects the
# GKGRECORDID and V2Persons columns).
data_frame.head()

In [None]:
# Build the connection table from the query result with the project helper.
# NOTE(review): later cells group this table by 'Source' and 'Target', so it
# presumably holds one row per co-mentioned name pair — confirm in get_connectivity.
connection_database = get_connectivity(data_frame)


In [None]:
# Persist the raw connection table.
connection_database.to_csv('../data/processed/connection_dataset.csv')

# Count how many rows each (Source, Target) pair contributes; the size of each
# group becomes the 'count' column of the edge table.
pair_sizes = connection_database.groupby(['Source', 'Target']).size()
edge_counts = pair_sizes.reset_index(name='count')
edge_counts.to_csv('../data/processed/edge_counts.csv', index_label='ix')
edge_counts.head()

In [None]:
# Scale the pair counts by the largest count so the strongest edge has weight 1.
weights = edge_counts['count'].to_numpy(dtype=float)
weights = weights / weights.max()

# assign() rather than insert(): insert() raises if the 'weight' column already
# exists, which made this cell fail when it was run a second time.
edge_counts = edge_counts.assign(weight=weights)

# Write next to the other processed files; the following cell reads the table
# back from exactly this path (the previous bare filename landed in the
# notebook's working directory instead). index=False keeps the row index out
# of the file.
edge_counts.to_csv('../data/processed/processed_edges.csv', index=False)

In [None]:
def _show_count_histogram(counts, title):
    """Draw a histogram of ``counts`` with 5-wide bins and show it under ``title``."""
    # Bin edges start at the smallest count and step by 5 past the largest one.
    bin_edges = range(min(counts), max(counts) + 5, 5)
    plt.hist(counts, bins=bin_edges)
    plt.title(title)
    plt.show()


# Reload the weighted edge table from disk and plot the full count distribution.
edge_counts = pd.read_csv('../data/processed/processed_edges.csv')
_show_count_histogram(edge_counts['count'].values, 'co-occurence of the name pairs')

# Drop every pair with a count below 5 and plot what is left.
rare_pairs = edge_counts[edge_counts['count'] < 5].index
data = edge_counts.drop(rare_pairs)
_show_count_histogram(data['count'].values, 'co-occurence of the name pairs; pruned')

In [None]:
# sum_outgoing_edges = edge_counts.groupby('Target')['count'].transform('sum')
# sum_outgoing_edges

In [None]:
# df_sorted = edge_counts.groupby('Source').apply(lambda x: x.sort_values('count', ascending=False))
# df_sorted

In [None]:
# df_sorted = edge_counts.sort_values(by=['Target', 'weight'], ascending=[True, False])
#
# # Take the top 5 rows for each 'Target' group
# df_filtered = df_sorted.groupby('Target').head(5).reset_index(drop=True)

In [None]:
# Fit the encoder on every name that occurs as a Source or a Target in the
# pruned edge table, so both columns share one integer coding.
node_names = np.union1d(data.Source.values, data.Target.values)
le = LabelEncoder()
le.fit(node_names)


In [None]:
# Integer codes for both endpoints of every edge, in the row order of `data`.
sources, targets = (le.transform(data[column].values) for column in ('Source', 'Target'))

In [None]:
# One row/column per encoded name. The shape has to be given explicitly:
# without it the matrix is sized from the largest index seen in each of
# `sources` and `targets`, which yields a non-square matrix whenever the
# highest-coded name appears on only one side, and the symmetrisation below
# then fails on mismatched shapes.
n_nodes = len(le.classes_)
adj_mat = coo_matrix(
    (data.weight.values, (sources, targets)),
    shape=(n_nodes, n_nodes),
).tocsr()

# Make the graph undirected: average each entry with its transpose.
adj_mat = (adj_mat + adj_mat.T) / 2.
# adj_mat[adj_mat < adj_mat.max()*0.5] = 0
print(adj_mat.shape)

In [None]:
# Draw the graph: adjacency matrix, the sorted unique node names, and the
# encoder's integer code for each of those names.
graph_names = np.union1d(data.Source.values, data.Target.values)
visualize_graph(adj_mat, graph_names, le.transform(graph_names))



Let's sparsify the graph a bit more so that its structure is easier to see.

In [None]:
# Convert the sparse adjacency matrix into a dense NumPy array so the row-wise
# pruning in the next cell can index and overwrite entries directly.
adj_mat = adj_mat.toarray()

In [None]:
TOP_K_NEIGHBOURS = 3  # number of strongest edges each node contributes; tune as needed


def keep_strongest_edges(weights, k):
    """Return a copy of a square weight matrix pruned to each node's k strongest edges.

    For every row the ``k`` largest entries are selected. An edge survives if
    either of its endpoints selected it, so a symmetric input yields a
    symmetric output. All other entries are set to 0.

    Parameters
    ----------
    weights : array-like, shape (n, n)
        Dense edge-weight matrix; 0 means "no edge".
    k : int
        Number of strongest edges selected per row. ``k <= 0`` removes every
        edge; ``k >= n`` keeps the matrix unchanged.

    Returns
    -------
    numpy.ndarray, shape (n, n)
        New float array; the input is not modified.

    Raises
    ------
    ValueError
        If ``weights`` is not a square 2-D matrix.

    Notes
    -----
    When several entries of a row tie at the k-th largest value, which of the
    tied entries is selected is decided by ``numpy.argpartition``.
    """
    weights = np.asarray(weights, dtype=float)
    if weights.ndim != 2 or weights.shape[0] != weights.shape[1]:
        raise ValueError(f"expected a square matrix, got shape {weights.shape}")

    n_nodes = weights.shape[0]
    if k <= 0:
        return np.zeros_like(weights)
    if k >= n_nodes:
        return weights.copy()

    # Column positions of the k largest values in every row.
    top_cols = np.argpartition(weights, -k, axis=1)[:, -k:]
    keep = np.zeros(weights.shape, dtype=bool)
    np.put_along_axis(keep, top_cols, True, axis=1)

    # Rows with fewer than k real edges also "select" zeros; ignore those so
    # they cannot resurrect the mirrored entry during symmetrisation.
    keep &= weights != 0
    # An edge stays if either endpoint ranked it among its strongest.
    keep |= keep.T
    return np.where(keep, weights, 0.0)


# Thresholding has to compare weights with weights. The earlier loop compared
# them with a position returned by argsort and built its column mask after the
# row had already been zeroed, so it did not implement a per-node cut-off.
adj_mat = keep_strongest_edges(adj_mat, TOP_K_NEIGHBOURS)

In [None]:
# Convert the pruned dense array back to CSR form before it is passed to
# visualize_graph again.
adj_mat = csr_matrix(adj_mat)

In [None]:
# Draw the pruned graph with the same node names and integer codes as before.
graph_names = np.union1d(data.Source.values, data.Target.values)
visualize_graph(adj_mat, graph_names, le.transform(graph_names))