In [63]:
import json
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
from urllib.parse import urlparse
import networkx as nx
import matplotlib.pyplot as plt
from itertools import combinations


# Config.
file_path = "../data/1eb80fb8b50.json"

segment_key = ["kws-l", "loc-l", "org-l", "per-l"]
Years = ["2024"]

columns_to_use = ["loc-l", "org-l", "per-l"]
ignore_words = ["Telegram", "TikTok", "Sputnik Africa"]

display_graph = True
save_graph = True
name_graph_saved = "graph_0"

colors = ['#1f78b4',  # Blue
            '#33a02c',  # Green
            '#e31a1c',  # Red
            '#ff7f00',  # Orange
            '#6a3d9a']  # Purple

In [64]:
# Get open data.
try:
    with open(file_path, "r") as json_file:
        data = json.load(json_file)
except FileNotFoundError:
    print(f"File not found: {file_path}")
except json.JSONDecodeError as e:
    print(f"JSON decoding error: {e}")

In [65]:
data["data"]["2024"]["2"]["29"][0]["loc-l"]

[[[]],
 [['Musawenkosi Mdluli']],
 [[], [], ['Mdluli']],
 [[]],
 [['africa']],
 [['Sputnik Africa']]]

In [56]:
# Get the data from the json.
df_kws_article = pd.DataFrame(columns=["year", "month", "day", "url", "title"] + segment_key)

for year in Years:
    for month in range(1, 13):
        for day in range(1, 32):
            num = 0
            while True:
                try:
                    data_tmp = data['data'][str(year)][str(month)][str(day)][num]
                    num += 1

                    base_data = {
                        "year": year,
                        "month": month,
                        "day": day,
                        "url": data_tmp.get("url", ""),
                        "title": data_tmp.get("title", "")
                    }

                    # Create a dictionary to hold the segment data
                    dict_tmp = {key: [] for key in segment_key}

                    # Extract segment data for each key
                    for key in segment_key:
                        if key in data_tmp:
                            for segment_data_tmp in data_tmp[key]:
                                for segment_data in segment_data_tmp:
                                    dict_tmp[key].append(segment_data)  # Store all segment entries in lists
                        else:
                            dict_tmp[key].append(None)  # Handle missing keys by appending None

                    # Find the maximum length of segment data (to know how many entries to create)
                    max_length = max([len(value) for value in dict_tmp.values()])

                    # Ensure each key has the same number of entries (fill shorter lists with None)
                    for key in dict_tmp.keys():
                        while len(dict_tmp[key]) < max_length:
                            dict_tmp[key].append(None)

                    # Add rows to the DataFrame based on the number of segment data entries
                    for i in range(max_length):
                        new_data = base_data.copy()
                        for key in segment_key:
                            new_data[key] = dict_tmp[key][i]  # Assign segment data for this row
                        df_kws_article = pd.concat([df_kws_article, pd.DataFrame([new_data])], ignore_index=True)

                except KeyError:
                    break
                except Exception as e:
                    print(f"Error processing data: {e}")
                    break

# Output the shape of the resulting dataframe
df_kws_article.shape


Error processing data: list index out of range
Error processing data: list index out of range
Error processing data: list index out of range
Error processing data: list index out of range
Error processing data: list index out of range
Error processing data: list index out of range
Error processing data: list index out of range
Error processing data: list index out of range
Error processing data: list index out of range
Error processing data: list index out of range
Error processing data: list index out of range
Error processing data: list index out of range
Error processing data: list index out of range
Error processing data: list index out of range
Error processing data: list index out of range
Error processing data: list index out of range


(3323, 9)

In [66]:
df_kws_article.head(10)

Unnamed: 0,year,month,day,url,title,kws-l,loc-l,org-l,per-l
0,2024,2,29,Sputnik Africa-14278,"""They're monopolizing everything"": African exp...","[monopolize, african, expert, financial, syste...",[],[BRICS],[]
1,2024,2,29,Sputnik Africa-14278,"""They're monopolizing everything"": African exp...","[world, banking, multinational, company, monop...",[Musawenkosi Mdluli],[World BRICS South Africa],[Sputnik Africa]
2,2024,2,29,Sputnik Africa-14278,"""They're monopolizing everything"": African exp...","[world, banking, multinational, company, monop...",[],[],[]
3,2024,2,29,Sputnik Africa-14278,"""They're monopolizing everything"": African exp...",[control],[],[],[]
4,2024,2,29,Sputnik Africa-14278,"""They're monopolizing everything"": African exp...","[multipolar, system, affect, change, financial...",[Mdluli],[],[]
5,2024,2,29,Sputnik Africa-14278,"""They're monopolizing everything"": African exp...","[read, story]",[],[],[]
6,2024,2,29,Sputnik Africa-14278,"""They're monopolizing everything"": African exp...",[subscribe],[africa],[],[]
7,2024,2,29,Sputnik Africa-14278,"""They're monopolizing everything"": African exp...","[tiktok, sputnik, africa, boost, telegram]",[Sputnik Africa],"[TikTok, Telegram]",[]
8,2024,3,1,Sputnik Africa-14279,🇲🇿 Mozambique's president gets immunity by Lon...,"[mozambique, president, get, immunity, london,...","[Mozambique, London]",[],[]
9,2024,3,1,Sputnik Africa-14279,🇲🇿 Mozambique's president gets immunity by Lon...,"[ruling, read, president, filipe, nyusi, immun...",[Mozambique],[],[Filipe Nyusi]


In [67]:
# Create a dictionary to map words to their corresponding colors
word_color_map = {}

list_words = []

# Iterate over rows to keep words from the same row grouped together
for i, row in df_kws_article.iterrows():
    row_words = []
    
    for col_idx, val in enumerate(columns_to_use):
        words_in_column = row[val]

        if isinstance(words_in_column, list) and words_in_column:
            row_words += words_in_column

            for word in words_in_column:
                word_color_map[word] = colors[col_idx % len(colors)]

    if row_words:
        list_words.append(row_words)

list_words

[['BRICS'],
 ['Musawenkosi Mdluli', 'World BRICS South Africa', 'Sputnik Africa'],
 ['Mdluli'],
 ['africa'],
 ['Sputnik Africa', 'TikTok', 'Telegram'],
 ['Mozambique', 'London'],
 ['Mozambique', 'Filipe Nyusi'],
 ['Privinvest', 'Credit Suisse', 'Privinvest'],
 ['Privinvest', 'FRELIMO', 'Nyusi'],
 ['Privinvest', 'Nyusi'],
 ['africa'],
 ['Sputnik Africa', 'TikTok', 'Telegram'],
 ['Russia', 'US'],
 ['Russia', 'US Strategic Command', 'USSTRATCOM', 'Anthony Cotton'],
 ['Russia', 'Cotton'],
 ['Tsirkon', 'Russia', 'Ukraine', 'Kinzhal'],
 ['africa'],
 ['Sputnik Africa', 'TikTok', 'Telegram'],
 ['Belgorod', 'Nizhny'],
 ['US', 'United State', 'Joe Biden', 'Donald Trump'],
 ['US', 'Trump', 'Biden'],
 ['Biden', 'Trump'],
 ['Russia',
  'Russia',
  'Ukraine',
  'Belgorod',
  'Kiev',
  'Il 76',
  'Tatyana Moskalkova'],
 ['Japan', 'Russia', 'Ukraine'],
 ['Russia', 'Turkey', 'Antalya Diplomatic Forum', 'Sergey Lavrov'],
 ['Eswatini',
  'Azerbaijan',
  'Kyrgyzstan',
  'Uzbekistan',
  'Hakan Fidan',
  'K

In [59]:
list_words

[['BRICS'],
 ['World BRICS South Africa'],
 [],
 [],
 [],
 [],
 [],
 ['TikTok', 'Telegram'],
 [],
 [],
 ['Privinvest', 'Credit Suisse', 'Privinvest'],
 ['Privinvest', 'FRELIMO'],
 ['Privinvest'],
 [],
 ['TikTok', 'Telegram'],
 [],
 ['US Strategic Command', 'USSTRATCOM'],
 ['Cotton'],
 ['Kinzhal'],
 [],
 ['TikTok', 'Telegram'],
 [],
 [],
 [],
 [],
 [],
 ['Il 76'],
 [],
 ['Antalya Diplomatic Forum'],
 [],
 [],
 [],
 [],
 ['TikTok', 'Telegram'],
 [],
 [],
 [],
 [],
 [],
 ['TikTok', 'Telegram'],
 [],
 [],
 ['NATO'],
 [],
 [],
 ['TikTok', 'Telegram'],
 [],
 [],
 ['TikTok', 'Telegram'],
 [],
 [],
 [],
 [],
 [],
 ['TikTok', 'Telegram'],
 [],
 ['Siriu'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['TikTok', 'Telegram'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['Ministry of Defense'],
 ['MoD'],
 [],
 [],
 ['Avdeyevka'],
 [],
 [],
 [],
 [],
 ['TikTok', 'Telegram'],
 ['Russian Security Council'],
 [],
 ['TikTok', 'Telegram'],
 ['Bundeswehr'],
 [],
 ['TikTok', 'Telegram'],
 ['Bundeswehr Air Force']

In [None]:
# Generate the graph
G = nx.Graph()

for word_list in list_words:
    words = [word for word in word_list if word not in ignore_words]
    
    for word1, word2 in combinations(words, 2):
        if G.has_edge(word1, word2):
            G[word1][word2]['weight'] += 1
        else:
            G.add_edge(word1, word2, weight=1)

# Set node colors as attributes (ensure colors are in hex format)
nx.set_node_attributes(G, {word: {'color': color} for word, color in word_color_map.items()})

# Draw the graph
if display_graph:
    plt.figure(figsize=(12, 8))
    pos = nx.spring_layout(G)

    node_colors = [word_color_map.get(node, '#d3d3d3') for node in G.nodes()]

    # Draw nodes with colors
    nx.draw_networkx_nodes(G, pos, node_color=node_colors, node_size=700)
    edges = G.edges(data=True)
    nx.draw_networkx_edges(G, pos, edgelist=edges, width=[d['weight'] for (u, v, d) in edges], alpha=0.5)
    nx.draw_networkx_labels(G, pos, font_size=12)

    plt.title("Word Co-occurrence Graph")
    plt.axis('off')
    plt.show()

# Save the graph with node colors in GEXF format
if save_graph:
    nx.write_gexf(G, name_graph_saved + ".gexf")