This notebook documents the process we followed to create the data used with NetworkX & Gephi.

In [4]:
import pandas as pd
import sys
sys.path.insert(0, '../') # Needed to import the DATES_STRINGS constant from a Python file.
from init_data import DATES_STRINGS

Create the base DataFrame from which the nodes & edges are built:

In [11]:
file_path = '../03_clean_dbs/clean_relevant_db.csv'
# Build the base DataFrame in a single chain; every step returns a new frame.
relevant_df = (
    pd.read_csv(file_path, index_col=0, encoding='utf-8')
    # The pageviews columns are loaded as floats; store them as ints instead.
    .astype({dates_string: int for dates_string in DATES_STRINGS})
    .assign(
        id=lambda frame: frame.index, # ID column for Gephi's usage
        total_relevance=lambda frame: frame['ukraine_relevance'] + frame['russia_relevance'],
    )
    .rename(columns={'value': 'Label'}) # Gephi's format expects a 'Label' column
    .assign(pointers=lambda frame: frame['pointers'].astype(str))
)
pointers_set = set(relevant_df['Label']) # Unique labels, used later to filter the pointers.

_label_to_id_cache = {} # Label -> id lookup, filled on first use; re-running this definition empties it again.

def convert_pointers_to_ids(pointers_string, label_to_id=None):
  """Convert a ', '-separated string of page labels into a list of node ids.

  Only pointers that match a known label are kept; unknown pointers are
  dropped silently. Duplicate pointers are collapsed, so every id appears at
  most once. The order of the returned ids follows set iteration order and
  carries no meaning.

  Parameters:
    pointers_string: str holding the pointers, separated by ', '.
    label_to_id: optional dict mapping a label to its id. When omitted, the
      mapping is built once from the 'Label' and 'id' columns of the global
      relevant_df (first row wins for a repeated label) and then reused.

  Returns:
    list of the ids of the pointers found in the mapping.
  """
  if label_to_id is None:
    if not _label_to_id_cache:
      # One pass over the frame replaces a full-frame scan per pointer.
      first_rows = relevant_df.drop_duplicates(subset='Label')
      _label_to_id_cache.update(zip(first_rows['Label'], first_rows['id']))
    label_to_id = _label_to_id_cache
  unique_pointers = set(pointers_string.split(', ')) # De-duplicate the pointers of this row
  return [label_to_id[pointer] for pointer in unique_pointers if pointer in label_to_id]

# Attach the id-based pointers to every row, then order the rows by id.
relevant_df = relevant_df.assign(
    pointers_ids=relevant_df['pointers'].apply(convert_pointers_to_ids)
).sort_values('id')
relevant_df.head()

Unnamed: 0,Label,ukraine_relevance,russia_relevance,pointers,20180401-20180501,20180501-20180601,20190901-20191001,20191001-20191101,20220101-20220201,20220201-20220301,id,total_relevance,pointers_ids
0,Ukrainia,437,135,"Architecture of Poland, Great Purge, Mykola Kh...",218,173,185,161,394,1331,0,572,"[6513, 2859, 2160, 4, 2416, 1372, 1106, 172, 4..."
1,Ukraine,436,135,"Architecture of Poland, Great Purge, Mykola Kh...",228629,259346,320373,305501,1342625,10093079,1,571,"[6513, 2859, 2160, 4, 2416, 1372, 1106, 172, 4..."
2,War in Donbass,368,306,"Soledar, Kyiv Post, Post-Soviet conflicts, Tri...",72012,79062,77510,78256,232111,1185121,2,674,"[117, 33, 5421, 1278, 6492, 6626, 1614, 863, 1..."
3,Russia–Ukraine relations,349,332,"Lazar Kaganovich, Belarusians, Rivne Nuclear P...",7618,7754,12557,10797,131310,1836700,3,681,"[33, 5591, 4, 2, 1936, 1614, 1219, 1314, 1875,..."
4,Ukrainian language,346,129,"Russian Empire Census, Great Purge, Turkic peo...",35159,40111,35751,38397,77086,431072,4,475,"[6513, 573, 1197, 3539, 121, 1517, 4294, 7049,..."


Create the Nodes & Edges datasets for every pageviews date range:

In [12]:
def create_edges_df_from_nodes_df(nodes_df):
    """Build the edges DataFrame that matches a nodes DataFrame.

    Every row of nodes_df is a source node. Each id in its 'pointers_ids'
    that also appears in the 'id' column of nodes_df becomes one edge; ids
    that are not nodes of this DF are skipped. Repeated ids within one row
    yield a single edge.

    Parameters:
        nodes_df: DataFrame with an 'id' column and a 'pointers_ids' column
            holding an iterable of ids per row.

    Returns:
        DataFrame with the columns 'Source' and 'Target', one row per edge.
        Sources follow the row order of nodes_df; the order of the targets
        of a single source follows set iteration order and carries no meaning.
    """
    source_ids = nodes_df['id'].to_list()
    ids_in_df = set(source_ids) # Built once; the loop below only reads it.
    edges_dict = {"Source": [], "Target": []} # Dict which will be used to generate the DF
    for source_id, pointer_ids in zip(source_ids, nodes_df['pointers_ids']):
        # Keep only the pointers which have a matching node in this DF.
        targets_in_df = ids_in_df & set(pointer_ids)
        edges_dict["Source"].extend([source_id] * len(targets_in_df))
        edges_dict["Target"].extend(targets_in_df)
    return pd.DataFrame(edges_dict)

def create_nodes_df_for_date_range(df, date_string):
    """Return the nodes of one dates range.

    Keeps the rows of df whose value in the date_string column isn't zero,
    limited to the columns 'id', 'Label', 'total_relevance', the date_string
    column (renamed to 'pageviews') and 'pointers_ids', in that order.
    """
    kept_columns = ['id', 'Label', 'total_relevance', date_string, 'pointers_ids']
    has_pageviews = df[date_string] != 0
    selected_rows = df.loc[has_pageviews, kept_columns]
    return selected_rows.rename(columns={date_string: 'pageviews'})

# Two dictionaries keyed by the dates range: the nodes DF of every range,
# and the edges DF derived from that same nodes DF.
nodes = {
    dates_string: create_nodes_df_for_date_range(relevant_df, dates_string)
    for dates_string in DATES_STRINGS
}
edges = {
    dates_string: create_edges_df_from_nodes_df(nodes_df)
    for dates_string, nodes_df in nodes.items()
}

Export the datasets into CSVs to be used inside of Gephi:

In [13]:
# Create all of the Nodes CSVs based on the DFs. The 'pointers_ids' column is
# left out (the links are exported separately as edges); it is dropped by name
# so the export doesn't depend on it being the last column.
for dates_string, nodes_df in nodes.items():
  nodes_df.drop(columns=['pointers_ids']).to_csv(f'{dates_string}_nodes_full.csv', index=False, encoding='utf-8')
# Create all of the Edges CSVs based on the DFs.
for dates_string, edges_df in edges.items():
  edges_df.to_csv(f'{dates_string}_edges_full.csv', index=False, encoding='utf-8')