# H1: Network Metrics Correlation Analysis

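This notebook analyzes correlations between network metrics from our Twitter dataset and compares those metrics across the all-topics, climate, and migration networks using Kruskal-Wallis tests.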

In [10]:
# Data handling and analysis
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from scipy import stats

# BigQuery
from google.cloud import bigquery
from google.cloud.exceptions import GoogleCloudError

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates
from matplotlib.colors import LinearSegmentedColormap

# Set up environment
import os

# Respect credentials that are already configured in the environment and fall
# back to the local service-account file only when nothing is set, so the
# notebook also runs on machines that do not have this exact path.
os.environ.setdefault(
    'GOOGLE_APPLICATION_CREDENTIALS',
    '/Users/zetta/projects/twitter-analysis-python/.secrets/service-account.json',
)

In [11]:
# Create BigQuery client.
# No project or credentials are passed explicitly, so the client relies on the
# library's default lookup -- presumably the GOOGLE_APPLICATION_CREDENTIALS
# variable set in the imports cell (confirm if the environment changes).
# The query helpers below read this module-level `client`.
client = bigquery.Client()

# Output directory setup
OUTPUT_DIR = "outputs"
# exist_ok=True avoids the check-then-create race and keeps the cell
# idempotent when it is re-run.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Dataset configurations
# All three networks live in the same BigQuery dataset and follow the
# `network_metrics_<key>` naming scheme, so the table path and the output
# prefix are both derived from the dataset key.
_METRICS_TABLE_TEMPLATE = (
    'grounded-nebula-408412.twitter_analysis_00_source_python.network_metrics_{key}'
)
_DATASET_DISPLAY_NAMES = {
    'all': 'All Topics Network',
    'climate': 'Climate Network',
    'migration': 'Migration Network',
}
DATASETS = {
    key: {
        'table': _METRICS_TABLE_TEMPLATE.format(key=key),
        'name': display_name,
        'output_prefix': key,
    }
    for key, display_name in _DATASET_DISPLAY_NAMES.items()
}

# Plotting configurations
# Figures are 12 inches wide with a 16:9 aspect ratio.
_FIGURE_WIDTH_IN = 12
PLOT_FIGURE_SIZE = (_FIGURE_WIDTH_IN, _FIGURE_WIDTH_IN / 16 * 9)
PLOT_DPI = 300        # resolution used when figures are saved
BASE_FONT_SIZE = 14   # base text size; titles are drawn slightly larger

# Configure plotting style: start from the seaborn whitegrid sheet, then apply
# the notebook-wide figure size and font size on top of it.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': PLOT_FIGURE_SIZE,
    'font.size': BASE_FONT_SIZE,
})

# Selected metrics for analysis
# perform_kruskal_wallis_test iterates over this list, and every name here is a
# column selected by the query in run_query_for_dataset.
SELECTED_METRICS = [
    "modularity",
    "network_avg_toxicity",
    "transitivity",
    "assortativity",
    "max_core_number",
    "rich_club_coefficient",
    "average_clustering",
    "connected_components",
    "density",
]

In [12]:
# Define all helper functions
def run_query(query, use_cache=True):
    """Run `query` on the module-level BigQuery client and return a DataFrame.

    Args:
        query: SQL text to execute.
        use_cache: Forwarded to `QueryJobConfig(use_query_cache=...)`.

    Returns:
        The query result as a DataFrame. This is a best-effort helper: any
        exception raised while running the query is printed, and an empty
        DataFrame is returned instead of propagating the error.
    """
    outcome = pd.DataFrame()
    try:
        config = bigquery.QueryJobConfig(use_query_cache=use_cache)
        fetched = client.query(query, job_config=config).to_dataframe(
            create_bqstorage_client=False
        )
        print(f"Query executed successfully. Retrieved {len(fetched)} rows.")
        outcome = fetched
    except Exception as exc:
        print(f"Error executing query: {exc}")
    return outcome

def normalize_series(series):
    """Return the z-scores of `series`: (value - mean) / standard deviation.

    The mean and standard deviation come from the object's own `.mean()` and
    `.std()` methods, so for a pandas Series the sample standard deviation
    is used.
    """
    center = series.mean()
    spread = series.std()
    return (series - center) / spread

def format_plot(ax, title=None, xlabel=None, ylabel=None, ylim_start=0, ylim_end=None):
    """Give a matplotlib axis the notebook's shared look and return it.

    Args:
        ax: Axis to format (modified in place).
        title: Optional title, drawn two points larger than BASE_FONT_SIZE.
        xlabel: Optional x-axis label.
        ylabel: Optional y-axis label.
        ylim_start: Lower y-limit. Applied together with `ylim_end` when that
            is given; otherwise applied on its own only if it is greater
            than zero.
        ylim_end: Optional upper y-limit.

    Returns:
        The same `ax`, so calls can be chained.
    """
    if title:
        ax.set_title(title, fontweight='regular', pad=15, fontsize=BASE_FONT_SIZE + 2)
    if xlabel:
        ax.set_xlabel(xlabel, fontsize=BASE_FONT_SIZE)
    if ylabel:
        ax.set_ylabel(ylabel, fontsize=BASE_FONT_SIZE)

    if ylim_end is None:
        # Only a lower bound is available, and only a positive one is applied.
        if ylim_start > 0:
            ax.set_ylim(bottom=ylim_start)
    else:
        ax.set_ylim(ylim_start, ylim_end)

    # Light dashed grid drawn underneath the data.
    grid_style = {"linestyle": "--", "alpha": 0.5, "color": "#E0E0E0"}
    ax.grid(True, **grid_style)
    ax.set_axisbelow(True)

    # Hide the top and right borders of the plot area.
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)
    return ax

In [13]:
# Define statistical analysis functions
def calculate_statistical_measures(df, metrics, normalize=True):
    """Compute pairwise association statistics for every unordered metric pair.

    For each pair (metric1, metric2) with metric1 listed before metric2, rows
    where either value is missing are dropped, both columns are optionally
    z-scored, and Pearson, Spearman and a degree-1 least-squares fit of
    metric2 on metric1 are computed.

    Args:
        df: DataFrame holding one numeric column per entry of `metrics`.
        metrics: List of column names to compare.
        normalize: If True, z-score both columns with `normalize_series`
            before computing the statistics.

    Returns:
        dict: `results[metric1][metric2]` is a dict with the keys
        'pearson_r', 'pearson_p', 'spearman_r', 'spearman_p', 'r_squared',
        'slope' and 'intercept'. The last metric maps to an empty dict.
        When a pair has fewer than two complete observations, or a column
        cannot be z-scored (zero variance), every value for that pair is NaN.
    """
    stat_keys = ('pearson_r', 'pearson_p', 'spearman_r', 'spearman_p',
                 'r_squared', 'slope', 'intercept')
    results = {}

    for i, metric1 in enumerate(metrics):
        results[metric1] = {}
        for metric2 in metrics[i + 1:]:
            x = df[metric1].to_numpy(dtype=float, na_value=np.nan)
            y = df[metric2].to_numpy(dtype=float, na_value=np.nan)

            # Pairwise deletion: one NaN would otherwise make the correlations
            # NaN and make the least-squares fit fail.
            complete = ~(np.isnan(x) | np.isnan(y))
            x, y = x[complete], y[complete]

            if normalize:
                x = normalize_series(pd.Series(x)).to_numpy()
                y = normalize_series(pd.Series(y)).to_numpy()

            # Too few points, or NaNs produced by z-scoring a constant column:
            # the statistics are undefined, so report NaN instead of crashing.
            if len(x) < 2 or np.isnan(x).any() or np.isnan(y).any():
                results[metric1][metric2] = dict.fromkeys(stat_keys, np.nan)
                continue

            pearson_r, pearson_p = stats.pearsonr(x, y)
            spearman_r, spearman_p = stats.spearmanr(x, y)

            # Linear fit y = slope * x + intercept and its R^2.
            coeffs = np.polyfit(x, y, 1)
            y_pred = np.polyval(coeffs, x)
            ss_res = np.sum((y - y_pred) ** 2)
            ss_tot = np.sum((y - y.mean()) ** 2)
            r_squared = 1 - ss_res / ss_tot

            results[metric1][metric2] = {
                'pearson_r': pearson_r,
                'pearson_p': pearson_p,
                'spearman_r': spearman_r,
                'spearman_p': spearman_p,
                'r_squared': r_squared,
                'slope': coeffs[0],
                'intercept': coeffs[1]
            }

    return results

def calculate_lagged_correlations(df, metrics, max_lag=3, normalize=True):
    """Compute Spearman correlation and linear-fit R^2 for lagged metric pairs.

    For every ordered pair (metric1, metric2) and every lag k in
    0..max_lag, metric1 at row t + k is paired with metric2 at row t (rows
    are taken in the DataFrame's existing order). Pairs with a missing value
    are dropped, both sides are optionally z-scored, and metric2 is
    regressed on metric1.

    Args:
        df: DataFrame holding one numeric column per entry of `metrics`.
        metrics: Iterable of column names.
        max_lag: Largest row offset to evaluate (inclusive).
        normalize: If True, z-score both sides with `normalize_series`.

    Returns:
        dict: `lag_correlations[metric1][metric2]` is a list with one dict
        per lag, holding 'lag', 'correlation' (Spearman) and 'r_squared'.
        When a lag leaves fewer than two complete pairs, or a side cannot be
        z-scored (zero variance), 'correlation' and 'r_squared' are NaN.
    """
    n_rows = len(df)
    lag_correlations = {}

    for metric1 in metrics:
        lag_correlations[metric1] = {}
        for metric2 in metrics:
            if metric1 == metric2:
                continue

            col1 = df[metric1].to_numpy(dtype=float, na_value=np.nan)
            col2 = df[metric2].to_numpy(dtype=float, na_value=np.nan)
            entries = []

            for lag in range(max_lag + 1):
                # Positional shift; max() keeps both sides empty (instead of
                # wrapping around) when the lag exceeds the number of rows.
                x = col1[lag:]
                y = col2[:max(n_rows - lag, 0)]

                # Pairwise deletion of missing observations.
                complete = ~(np.isnan(x) | np.isnan(y))
                x, y = x[complete], y[complete]

                if normalize:
                    x = normalize_series(pd.Series(x)).to_numpy()
                    y = normalize_series(pd.Series(y)).to_numpy()

                if len(x) < 2 or np.isnan(x).any() or np.isnan(y).any():
                    # Statistics are undefined for this lag.
                    corr = np.nan
                    r_squared = np.nan
                else:
                    coeffs = np.polyfit(x, y, 1)
                    y_pred = np.polyval(coeffs, x)
                    ss_res = np.sum((y - y_pred) ** 2)
                    ss_tot = np.sum((y - y.mean()) ** 2)
                    r_squared = 1 - ss_res / ss_tot
                    corr = stats.spearmanr(x, y)[0]

                entries.append({
                    'lag': lag,
                    'correlation': corr,
                    'r_squared': r_squared
                })

            lag_correlations[metric1][metric2] = entries

    return lag_correlations

In [14]:
# Define Kruskal-Wallis test function
def perform_kruskal_wallis_test(datasets_data, metrics=None):
    """
    Perform Kruskal-Wallis H-test across different network datasets for each metric.

    Missing values are dropped per dataset before testing. Mean ranks are
    computed by ranking the pooled observations of all datasets together and
    averaging the ranks that belong to each dataset.

    Args:
        datasets_data (dict): Dictionary containing DataFrames for each dataset
        metrics (list, optional): Column names to test. Defaults to the
            module-level SELECTED_METRICS when omitted.

    Returns:
        dict: Dictionary containing test results for each metric, with the
        keys 'h_statistic', 'p_value', 'dataset_sizes', 'dataset_names' and
        'mean_ranks' (dataset name -> mean pooled rank).

    Raises:
        ValueError: Propagated from `scipy.stats.kruskal`, e.g. when fewer
            than two datasets are supplied.
    """
    if metrics is None:
        metrics = SELECTED_METRICS

    # The dataset order is the same for every metric, so resolve it once.
    dataset_names = list(datasets_data.keys())
    results = {}

    for metric in metrics:
        # Prepare data for the test
        metric_data = [df[metric].dropna() for df in datasets_data.values()]
        group_sizes = [len(data) for data in metric_data]

        # Perform Kruskal-Wallis H-test
        h_statistic, p_value = stats.kruskal(*metric_data)

        # Rank the pooled data, then cut the rank vector back into groups;
        # concatenation preserves group order, so cumulative sizes mark the
        # group boundaries.
        all_ranks = stats.rankdata(np.concatenate(metric_data))
        boundaries = np.cumsum(group_sizes)[:-1]
        mean_ranks = [np.mean(group) for group in np.split(all_ranks, boundaries)]

        results[metric] = {
            'h_statistic': h_statistic,
            'p_value': p_value,
            'dataset_sizes': group_sizes,
            'dataset_names': list(dataset_names),
            'mean_ranks': dict(zip(dataset_names, mean_ranks)),
        }

    return results

In [15]:
def run_query_for_dataset(dataset_config):
    """Fetch the network metrics of one dataset, ordered by `month_start`.

    Args:
        dataset_config: Mapping with a 'table' entry (table to select from)
            and a 'name' entry (label used in the progress messages).

    Returns:
        The query result as a DataFrame. This is a best-effort helper: any
        exception raised while running the query is printed, and an empty
        DataFrame is returned instead of propagating the error.
    """
    query = f"""
    SELECT 
        month_start,
        nodes,
        edges,
        density,
        connected_components,
        transitivity,
        modularity,
        modularity_classes,
        assortativity,
        network_avg_toxicity,
        median_node_toxicity,
        max_core_number,
        avg_core_number,
        rich_club_coefficient,
        average_clustering
    FROM `{dataset_config['table']}`
    ORDER BY month_start
    """

    outcome = pd.DataFrame()
    try:
        config = bigquery.QueryJobConfig(use_query_cache=True)
        fetched = client.query(query, job_config=config).to_dataframe(
            create_bqstorage_client=False
        )
        print(f"Query executed successfully for {dataset_config['name']}. Retrieved {len(fetched)} rows.")
        outcome = fetched
    except Exception as exc:
        print(f"Error executing query for {dataset_config['name']}: {exc}")
    return outcome

In [16]:
# Load data for each dataset, keeping only the ones that returned rows
datasets_data = {}
for dataset_key, dataset_config in DATASETS.items():
    df = run_query_for_dataset(dataset_config)
    if df.empty:
        # A failed query also comes back as an empty frame; leave it out
        continue
    datasets_data[dataset_key] = df

Query executed successfully for All Topics Network. Retrieved 24 rows.
Query executed successfully for Climate Network. Retrieved 24 rows.
Query executed successfully for Migration Network. Retrieved 24 rows.


In [8]:
# Perform Kruskal-Wallis tests
kw_results = perform_kruskal_wallis_test(datasets_data)

# Create a summary DataFrame: one row per metric, with the test statistics
# followed by one `<dataset>_mean_rank` column per dataset
summary_rows = [
    {
        'Metric': metric,
        'H-statistic': result['h_statistic'],
        'p-value': result['p_value'],
        'Significant': result['p_value'] < 0.05,
        **{
            f'{dataset}_mean_rank': rank
            for dataset, rank in result['mean_ranks'].items()
        },
    }
    for metric, result in kw_results.items()
]

# Smallest p-values first
summary_df = pd.DataFrame(summary_rows).sort_values('p-value')

# Display results
print("\nKruskal-Wallis Test Results:")
print("============================")
# Lift the column and width limits so the table is not truncated
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
print(summary_df.to_string(index=False))


Kruskal-Wallis Test Results:
               Metric  H-statistic      p-value  Significant  all_mean_rank  climate_mean_rank  migration_mean_rank
 connected_components    54.577055 1.408456e-12         True      60.500000          16.375000            32.625000
   average_clustering    47.461377 4.941887e-11         True      60.500000          23.458333            25.541667
         transitivity    46.257420 9.022541e-11         True      59.958333          27.833333            21.708333
      max_core_number    44.473981 2.200886e-10         True      59.416667          21.729167            28.354167
           modularity    38.859209 3.646113e-09         True      14.833333          45.750000            48.916667
rich_club_coefficient    36.719368 1.062895e-08         True      57.625000          26.541667            25.333333
              density    29.229642 4.496388e-07         True      24.250000          55.041667            30.208333
 network_avg_toxicity    14.303843 7.83357

In [9]:
# Create visualization of results
rank_columns = [col for col in summary_df.columns if col.endswith('_mean_rank')]
metrics = summary_df['Metric'].tolist()
datasets = [col.replace('_mean_rank', '') for col in rank_columns]

# One row per metric (in summary_df order) and one column per dataset
rank_data = summary_df[rank_columns].to_numpy(dtype=float)

# Create heatmap
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    rank_data,
    xticklabels=datasets,
    yticklabels=metrics,
    cmap='YlOrRd',
    annot=True,
    fmt='.1f',
    ax=ax,
)
ax.set_title('Mean Ranks by Dataset and Metric')
fig.tight_layout()
fig.savefig(os.path.join(OUTPUT_DIR, 'kruskal_wallis_heatmap.png'), dpi=PLOT_DPI, bbox_inches='tight')
# The figure is only written to disk; close it to release the memory
plt.close(fig)

# Save results to CSV
summary_df.to_csv(os.path.join(OUTPUT_DIR, 'kruskal_wallis_results.csv'), index=False)