# Network Analysis with BigQuery
## Bachelor Thesis Data Analysis

This notebook contains templates and utilities for analyzing social network data stored in BigQuery.

In [None]:
# Render matplotlib figures inline in the notebook output
%matplotlib inline

## 1. Setup and Configuration

Import libraries and set up BigQuery connections.

In [None]:
# Data handling and analysis
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# BigQuery
from google.cloud import bigquery
from google.cloud.exceptions import GoogleCloudError

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates
from matplotlib.colors import LinearSegmentedColormap

# Plot styling: apply the base style sheet first, then layer the
# notebook-specific overrides on top of it in a single update.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.family': 'serif',
    'axes.labelsize': 12,
    'axes.titlesize': 14,
})

# Shared colour list: registered as the seaborn palette and also indexed
# directly (colors[0], colors[1], ...) when individual series are drawn.
colors = ['#4C72B0', '#55A868', '#C44E52', '#8172B2', '#CCB974']
sns.set_palette(sns.color_palette(colors))

In [None]:
# Where the data lives: the BigQuery dataset and the Google Cloud project
# that contains it. Both names are read by the helpers and queries below.
dataset = "python_src"
project_id = "grounded-nebula-408412"

# Single shared client, bound to the project configured above
client = bigquery.Client(project=project_id)

## 2. BigQuery Utilities

This section includes utility functions for working with BigQuery:
- List all tables in the dataset
- Retrieve the schema of a specific table

In [None]:
def list_tables():
    """Print the ID of every table in the configured dataset.

    Reads the module-level ``client`` and ``dataset``; the dataset is looked
    up in the client's own project. The table IDs are printed one per line
    under a header and nothing is returned.

    A ``GoogleCloudError`` raised while listing is caught and reported with a
    printed message instead of propagating.
    """
    try:
        # Client.dataset() is deprecated in google-cloud-bigquery; build the
        # reference explicitly, in the client's project, so the lookup target
        # stays the same.
        dataset_ref = bigquery.DatasetReference(client.project, dataset)
        # Materialise all IDs before printing so an API error never leaves a
        # half-printed listing.
        table_ids = [item.table_id for item in client.list_tables(dataset_ref)]
        print("Tables in dataset:")
        for table_id in table_ids:
            print(f"- {table_id}")
    except GoogleCloudError as e:
        print(f"Error listing tables: {str(e)}")

In [None]:
def get_table_schema(table_id):
    """Return the schema of one table in the configured dataset as a DataFrame.

    Args:
        table_id: ID of the table inside the module-level ``dataset``; the
            dataset is looked up in the client's own project.

    Returns:
        A DataFrame with one row per schema field and the columns
        "Column Name", "Data Type" and "Mode" (present even when the table
        has no fields). If the API raises a ``GoogleCloudError``, the error
        is printed and an empty DataFrame is returned instead.
    """
    try:
        # Client.dataset() is deprecated in google-cloud-bigquery; build the
        # reference explicitly, in the client's project, so the lookup target
        # stays the same.
        dataset_ref = bigquery.DatasetReference(client.project, dataset)
        table = client.get_table(dataset_ref.table(table_id))
        rows = [
            {"Column Name": field.name, "Data Type": field.field_type, "Mode": field.mode}
            for field in table.schema
        ]
        # Pin the column order so a table without fields still yields a frame
        # that has the expected headers.
        return pd.DataFrame(rows, columns=["Column Name", "Data Type", "Mode"])
    except GoogleCloudError as e:
        print(f"Error fetching schema: {str(e)}")
        return pd.DataFrame()

### Example Usage

To list all tables in the dataset:
```python
list_tables()
```

To get the schema of a specific table:
```python
get_table_schema('your_table_name')
```

## 3. Query Execution Function

In [None]:
def run_query(query, use_cache=True):
    """Run a SQL statement on BigQuery and hand the rows back as a DataFrame.

    Args:
        query: SQL text submitted through the module-level ``client``.
        use_cache: Forwarded as ``use_query_cache`` on the job configuration.

    Returns:
        The job result converted with ``to_dataframe()``. If a
        ``GoogleCloudError`` is raised, the error is printed and an empty
        DataFrame is returned instead.
    """
    try:
        job = client.query(
            query,
            job_config=bigquery.QueryJobConfig(use_query_cache=use_cache),
        )
        result = job.to_dataframe()
    except GoogleCloudError as err:
        # Best-effort: report the failure and let the caller see an empty frame
        print(f"Error executing query: {str(err)}")
        result = pd.DataFrame()
    return result

## 4. Network Size Visualization

In [None]:
# Fetch the per-month size metrics of the climate network, oldest month first
size_query = f"""
SELECT month_start, nodes, edges, density
FROM `{project_id}.{dataset}.python_network_climate_network_metrics`
ORDER BY month_start
"""
size_df = run_query(size_query)

# run_query() reports API errors by returning a frame without any columns.
# Stop here with a clear message instead of an opaque KeyError further down.
if 'month_start' not in size_df.columns:
    raise RuntimeError(
        "Network size query returned no 'month_start' column; "
        "check the query error printed above."
    )

# Convert date column so matplotlib treats the x-axis as dates
size_df['month_start'] = pd.to_datetime(size_df['month_start'])

# Left axis: absolute counts of nodes and edges
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(size_df['month_start'], size_df['nodes'], marker='o', color=colors[0], label='Nodes')
ax.plot(size_df['month_start'], size_df['edges'], marker='s', color=colors[1], label='Edges')
ax.set_title('Climate Network Size Over Time')
ax.set_xlabel('Month')
ax.set_ylabel('Count')

# Right axis: density, drawn on its own y-scale
ax2 = ax.twinx()
ax2.plot(size_df['month_start'], size_df['density'], color=colors[2], linestyle='--', marker='^', label='Density')
ax2.set_ylabel('Density', color=colors[2])
ax2.tick_params(axis='y', colors=colors[2])

# One legend for all three series: each axes only knows its own lines, so the
# handles are collected from both. It is attached to the twin axes, which is
# drawn last, so no line is painted over it.
count_handles, count_labels = ax.get_legend_handles_labels()
density_handles, density_labels = ax2.get_legend_handles_labels()
ax2.legend(count_handles + density_handles, count_labels + density_labels)

# Format x-axis dates. The rotation is set on `ax` explicitly: plt.xticks()
# acts on the current axes, which after twinx() is ax2 with its hidden x-axis,
# so it would not rotate the visible labels.
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
ax.tick_params(axis='x', labelrotation=45)
plt.tight_layout()
plt.show()