In [1]:
import pandas as pd
import numpy as np
import os

In [6]:
def calculate_and_save_adjacency_matrices(end_dates, data_dir = './data/processed/d_corr/GT', thresholds = [0.5,0.4,0.6], is_srd = False):
    for threshold in thresholds:
        adj_mat_subfolder = f"threshold-{threshold}"
        adj_mat_path = f'{data_dir}/adjacency_matrices/{adj_mat_subfolder}'
        cor_mat_path = f'{data_dir}'

        if not os.path.exists(adj_mat_path):
            os.makedirs(adj_mat_path, exist_ok=True)

        for end_date in end_dates:
            corr_df = pd.read_csv(f'{cor_mat_path}/{end_date}.csv', index_col=0)
            
            # Convert all columns (except index) to numeric, errors='coerce' will convert non-numeric values to NaN
            corr_df = corr_df.apply(pd.to_numeric, errors='coerce')
            
            # Replace NaN with 0 to avoid comparison issues

            corr_df = corr_df.fillna(0)

            adjacency_matrix = np.where(corr_df.values >= threshold, 1, 0)

            corr_df.iloc[:, :] = adjacency_matrix
            
            corr_df.to_csv(f'./data/processed/d_corr_adjacency_matrices/threshold-{threshold}/{end_date}.csv')
            corr_df.to_pickle(f'./data/processed/d_corr_adjacency_matrices/threshold-{threshold}/{end_date}.pkl')


# For untreated network

In [3]:
gt_df = pd.read_csv('./data/processed/gt.csv', parse_dates=['date'])
dates = gt_df['date'].dt.date
del gt_df

window_width = 30

selection_index_array = [(index, index + window_width) for index in range(len(dates) - window_width)]
end_dates = [dates[index] for (_, index) in selection_index_array]
len(end_dates)

336

In [7]:
calculate_and_save_adjacency_matrices(end_dates=end_dates)

# For treated (SRD) network

In [5]:
srd_df = pd.read_pickle('./data/processed/[03]srd_out/srd_out.pkl')
dates = srd_df['date'].values
del srd_df

window_width = 30

selection_index_array = [(index, index + window_width) for index in range(len(dates) - window_width)]
end_dates = [dates[index] for (_, index) in selection_index_array]
len(end_dates)

FileNotFoundError: [Errno 2] No such file or directory: './data/processed/[03]srd_out/srd_out.pkl'

In [23]:
calculate_and_save_adjacency_matrices(end_dates=end_dates, is_srd = True)

In [4]:

import os
from itertools import combinations

def count_triads_per_term(directory):
    triad_counts = {}
    files = [f for f in os.listdir(directory) if f.endswith('.csv')]

    for file in files:
        df = pd.read_csv(os.path.join(directory, file), index_col=0)
        words = df.columns.tolist()
        
        # Create a dictionary to store connections
        connections = {word: set() for word in words}

        # Populate connections based on the DataFrame
        for i in range(len(words)):
            for j in range(i + 1, len(words)):
                if df.iloc[i, j] == 1:  # Check if there's a connection
                    connections[words[i]].add(words[j])
                    connections[words[j]].add(words[i])

        # Count triads for each word
        for word in words:
            triad_count = 0
            for word1, word2 in combinations(connections[word], 2):
                if word1 in connections and word2 in connections[word1]:
                    triad_count += 1
            
            # Store the count for the word
            if word in triad_counts:
                triad_counts[word] += triad_count // 3  # Each triad is counted 3 times
            else:
                triad_counts[word] = triad_count // 3

    return triad_counts

# Usage
directory_path = r'C:\Users\Cheska Hung\Desktop\Thesis-GT Infodemiology\-Thesis-Infodemiology-GT\covid-19-google-trends-network\scripts\construct_network\data\processed\RSV_adjacency_matrices\threshold-0.5'
triad_counts = count_triads_per_term(directory_path)

# Print the total number of triads for each search term
for term, count in triad_counts.items():
    print(f'{term}: {count}')

flu: 48
cough: 36
fever: 27
headache: 0
lagnat: 17
rashes: 0
sipon: 18
ubo: 14
ecq: 23
face-shield: 11
Frontliners: 3
masks: 0
Quarantine: 38
social-distancing: 31
work-from-home: 21
