# BigQuery Query Example - Jupyter Notebook

This notebook demonstrates querying the PHYSICIANS_OVERVIEW table using Python in a Jupyter environment.

## Setup and Import Libraries

In [None]:
# Import required libraries
import os
import pandas as pd
import numpy as np
from google.cloud import bigquery
from google.oauth2 import service_account
import matplotlib.pyplot as plt
import seaborn as sns
from dotenv import load_dotenv

# Plot appearance: apply the matplotlib style sheet first, then the seaborn
# colour palette (same order as before, so the palette is set last)
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

# pandas display: show every column, at most 100 rows, and at most
# 50 characters per cell
pd.options.display.max_columns = None
pd.options.display.max_rows = 100
pd.options.display.max_colwidth = 50

## Load Environment Variables and Authenticate

In [None]:
# Load environment variables from common/.env
# (the path is relative to the notebook's working directory)
ENV_FILE = '../../common/.env'
load_dotenv(ENV_FILE)

# Get the path to service account key
credentials_path = os.getenv('GOOGLE_APPLICATION_CREDENTIALS')

# Fail early with an actionable message: os.getenv returns None when the
# variable is missing, which would otherwise surface as an opaque error
# from inside the credentials loader.
if not credentials_path:
    raise RuntimeError(
        "GOOGLE_APPLICATION_CREDENTIALS is not set. Define it in "
        f"{ENV_FILE} or in the environment before running this cell."
    )
if not os.path.isfile(credentials_path):
    raise FileNotFoundError(
        f"Service account key file not found: {credentials_path}"
    )

# Create credentials object from the service account key file
credentials = service_account.Credentials.from_service_account_file(
    credentials_path,
    scopes=["https://www.googleapis.com/auth/cloud-platform"],
)

# Initialize BigQuery client with credentials; the project is taken from
# the service account itself
client = bigquery.Client(
    credentials=credentials,
    project=credentials.project_id,
)

print(f"Connected to project: {client.project}")

## Query the PHYSICIANS_OVERVIEW Table

In [None]:
# SQL for a 100-row sample of the PHYSICIANS_OVERVIEW table
query = """
SELECT * 
FROM `data-analytics-389803.CONFLIXIS_309340.PHYSICIANS_OVERVIEW` 
LIMIT 100
"""

# Submit the query, then materialise the result as a DataFrame
query_job = client.query(query)
df = query_job.to_dataframe()

# Report the shape of the result and list its columns
print(f"Query returned {len(df)} rows and {len(df.columns)} columns")
print("\nColumn names:", df.columns.tolist(), sep="\n")

## Explore the Data

In [None]:
# Preview the top of the result; the bare last expression is rendered
# by Jupyter as a table
print("First 5 rows of the data:")
df.head(n=5)

In [None]:
# Display basic information about the dataset.
# df.info() prints its report itself (column names, dtypes and non-null
# counts per the pandas documentation), so no print() wrapper is needed.
print("Dataset Info:")
df.info()

In [None]:
# Display summary statistics for numeric columns.
# The bare last expression lets Jupyter render the resulting table.
print("Summary Statistics:")
df.describe()

In [None]:
# Count nulls in every column, then show only the columns that have any
print("Missing values per column:")
missing_values = df.isna().sum()
missing_values.loc[missing_values.gt(0)]

## Simple Visualization Example

Note: The visualization below is a placeholder. Update it based on the actual columns in your PHYSICIANS_OVERVIEW table.

In [None]:
# Example: plot the distribution of the first categorical column that
# actually has values to count.
# Update this based on your actual data columns

# Get categorical columns (object dtype)
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

# Pick the first candidate that yields a non-empty frequency table.
# A column that is entirely null produces an empty value_counts(), which
# cannot be drawn as a bar chart, so such columns are skipped.
col = None
value_counts = None
for candidate in categorical_cols:
    try:
        counts = df[candidate].value_counts().head(10)
    except TypeError:
        # value_counts() needs hashable values; skip columns that hold
        # e.g. lists or dicts (presumably possible for nested/repeated
        # fields - confirm against the table schema)
        continue
    if not counts.empty:
        col, value_counts = candidate, counts
        break

if col is not None:
    # Explicit figure/axes instead of the implicit pyplot state
    fig, ax = plt.subplots(figsize=(10, 6))
    value_counts.plot(kind='bar', ax=ax)
    ax.set_title(f'Top 10 Values in {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Count')
    # Tilt the category labels so long values stay readable
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    fig.tight_layout()
    plt.show()
elif categorical_cols:
    print("Categorical columns found, but none contain plottable values")
else:
    print("No categorical columns found for visualization")

## Export Results (Optional)

In [None]:
# Optional export: every statement in this cell is commented out, so running
# it as-is does nothing. Uncomment the pair of lines for the format you want.
# Both file names are relative paths.

# Save to CSV
# df.to_csv('physicians_overview_sample.csv', index=False)
# print("Data saved to physicians_overview_sample.csv")

# Save to Excel
# NOTE(review): to_excel presumably needs an Excel writer package
# (e.g. openpyxl) installed - confirm in your environment.
# df.to_excel('physicians_overview_sample.xlsx', index=False)
# print("Data saved to physicians_overview_sample.xlsx")

## Query with Parameters

The example below uses a parameterized query: values are passed as query parameters instead of being formatted into the SQL string, which is safer and more flexible.

In [None]:
# Example: the row limit is supplied as a named query parameter
# Modify this based on your actual table schema

# SQL text with an @limit_rows placeholder
parameterized_query = """
SELECT * 
FROM `data-analytics-389803.CONFLIXIS_309340.PHYSICIANS_OVERVIEW` 
LIMIT @limit_rows
"""

# Bind the placeholder to an INT64 value of 50
limit_param = bigquery.ScalarQueryParameter("limit_rows", "INT64", 50)
job_config = bigquery.QueryJobConfig(query_parameters=[limit_param])

# Run the query with the bound parameter and load the result
param_job = client.query(parameterized_query, job_config=job_config)
df_param = param_job.to_dataframe()
print(f"Parameterized query returned {len(df_param)} rows")

## Next Steps

- Explore specific columns relevant to your analysis
- Join with other tables in the dataset
- Create more sophisticated visualizations
- Build analytical models based on the data