# V/C/P Gene Variant Geographic Map

This notebook visualizes the mean allele frequencies of V/C/P gene variants across geographic locations.

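**V/C/P Gene** (labelled `P/V/C` in the gene annotation): The measles virus P gene encodes three proteins, using RNA editing (V) and an overlapping reading frame (C) in addition to the main P reading frame: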
- **Phosphoprotein (P)**: Essential for viral RNA synthesis
- **V protein**: Interferon antagonist
- **C protein**: Interferes with host innate immunity

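We create the following visualizations:
1. US states (choropleth map)
2. Canadian provinces (bar chart)
3. Romania (Bucharest) (bar chart)
4. All locations combined (bar chart coloured by country)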

In [None]:
import gxy
import pandas as pd
import altair as alt
from vega_datasets import data as vega_data

# Suppress warnings
# NOTE(review): the second filterwarnings() call below ignores every warning,
# which makes the narwhals-specific filter on the line before it redundant. It
# also hides any warning raised by later cells - confirm that blanket
# suppression is really intended, or keep only the targeted filter.
import warnings
warnings.filterwarnings('ignore', message='.*narwhals.*')
warnings.filterwarnings('ignore')

print("Libraries loaded")

In [None]:
# Dataset IDs - change these to match your history
# The three values are handed to gxy.get() in exactly this order, and the
# loading cell reads the returned paths by position (0, 1, 2).
VARIANTS_PER_SAMPLE = 13433   # Per-sample variants with AF
METADATA_CLEAN = 13681        # Sample metadata (sample_id, location, date)
ANNOTATED_VARIANTS = 13682    # Aggregated variants with gene annotation

In [None]:
# Download all datasets
paths = await gxy.get([VARIANTS_PER_SAMPLE, METADATA_CLEAN, ANNOTATED_VARIANTS])
print(f"Downloaded {len(paths)} files")
for p in paths:
    print(f"  {p}")

In [None]:
def _first_row_is_header(path, sep='\t', numeric_col=2):
    """Return True when the first line of `path` looks like a header row.

    The first line is split on `sep` and field `numeric_col` is parsed as a
    number. A missing or non-numeric field is treated as a column label, so
    the line is reported as a header (this includes an empty file).
    """
    with open(path) as handle:
        fields = handle.readline().rstrip('\r\n').split(sep)
    try:
        float(fields[numeric_col])
    except (IndexError, ValueError):
        return True
    return False


# Load per-sample variants with explicit column names.
# The file was described as header-less, yet the first line used to be skipped
# unconditionally, which would silently discard one variant call if there
# really is no header. Skip the first line only when its POS field is not a
# number, i.e. when it actually is a header row.
# NOTE(review): assumes paths[0] is a local filesystem path accepted by open().
per_sample_cols = ['Sample', 'CHROM', 'POS', 'REF', 'ALT', 'QUAL', 'DP', 'AF', 'DP4', 'POSREFALT']
per_sample_has_header = _first_row_is_header(
    paths[0], sep='\t', numeric_col=per_sample_cols.index('POS')
)
per_sample = pd.read_csv(
    paths[0],
    sep='\t',
    names=per_sample_cols,
    skiprows=1 if per_sample_has_header else 0,
)
print(f"Per-sample variants: {len(per_sample)} rows")

# Load metadata (has header)
metadata = pd.read_csv(paths[1], sep='\t')
print(f"Sample metadata: {len(metadata)} samples")

# Load annotated variants (has header)
annotated = pd.read_csv(paths[2], sep='\t')
print(f"Annotated variants: {len(annotated)} variants")

In [None]:
# Keep only the annotated variants whose gene label is exactly 'P/V/C'
is_vcp_gene = annotated['gene'] == 'P/V/C'
vcp_variants = annotated.loc[is_vcp_gene].copy()
print(f"V/C/P gene variants: {len(vcp_variants)}")

# Break the selection down by the annotated product
product_counts = vcp_variants['product'].value_counts()
print("\nBy product:")
print(product_counts.to_string())

In [None]:
# Collect the distinct variant IDs of the V/C/P selection
vcp_var_ids = {*vcp_variants['var_id'].tolist()}
print(f"V/C/P variant IDs: {len(vcp_var_ids)}")

# Restrict the per-sample calls to those IDs (matched against POSREFALT)
is_vcp_call = per_sample['POSREFALT'].isin(vcp_var_ids)
per_sample_vcp = per_sample.loc[is_vcp_call].copy()
print(f"Per-sample V/C/P variants: {len(per_sample_vcp)} rows")

In [None]:
# Attach sample metadata (including location) to every V/C/P call.
# Inner join: calls whose Sample has no matching sample_id are left out.
merged = pd.merge(
    per_sample_vcp,
    metadata,
    how='inner',
    left_on='Sample',
    right_on='sample_id',
)

n_merged_samples = merged['Sample'].nunique()
n_merged_locations = merged['location'].nunique()
print(f"Merged rows: {len(merged)}")
print(f"Unique samples: {n_merged_samples}")
print(f"Unique locations: {n_merged_locations}")

In [None]:
# Per-location summary: mean AF over all calls, number of distinct samples,
# and number of variant calls
calls_by_location = merged.groupby('location')
location_stats = calls_by_location.agg(
    mean_af=pd.NamedAgg(column='AF', aggfunc='mean'),
    sample_count=pd.NamedAgg(column='Sample', aggfunc='nunique'),
    variant_count=pd.NamedAgg(column='POSREFALT', aggfunc='count'),
).reset_index()

# Show the table with the highest mean AF first
ranked_locations = location_stats.sort_values('mean_af', ascending=False)
print("=== Mean AF by Location ===")
print(ranked_locations.to_string(index=False))

In [None]:
# Parse location into country and state/province
def parse_location(loc):
    """Split a location label into a ``(country, region)`` tuple.

    The label is split at the first ``:``; text before it is the country and
    everything after it is the region. Surrounding whitespace is removed from
    both parts, so ``'USA: Texas'`` and ``'USA:Texas'`` give the same result.

    Args:
        loc: Location label, e.g. ``'USA:Texas'``.

    Returns:
        ``(country, region)``. A label without ``:`` is returned as both
        country and region. A non-string value (for example a missing value)
        is passed through unchanged in both positions instead of raising.
    """
    if not isinstance(loc, str):
        return loc, loc

    country, colon, region = loc.partition(':')
    if not colon:
        name = loc.strip()
        return name, name
    return country.strip(), region.strip()

# Split every location label into country and region columns.
# The parts are collected into one frame up front (instead of building a
# pd.Series per row through apply), which also works when location_stats has
# no rows. dtype=object keeps the columns usable with `.str` even when empty.
location_parts = pd.DataFrame(
    [parse_location(loc) for loc in location_stats['location']],
    columns=['country', 'region'],
    index=location_stats.index,
    dtype=object,
)
location_stats['country'] = location_parts['country']
location_stats['region'] = location_parts['region']

# Separate by country
us_data = location_stats[location_stats['country'] == 'USA'].copy()
canada_data = location_stats[location_stats['country'] == 'Canada'].copy()
romania_data = location_stats[location_stats['country'].str.contains('Romania', na=False)].copy()

print(f"US locations: {len(us_data)}")
print(f"Canada locations: {len(canada_data)}")
print(f"Romania locations: {len(romania_data)}")

In [None]:
# === US State Map ===

# Load US states topojson
states = alt.topo_feature(vega_data.us_10m.url, 'states')

# State name to ID mapping (FIPS codes). The IDs are matched against the `id`
# field of the state shapes in the map lookup. District of Columbia (FIPS 11)
# is included so that samples from DC are not dropped.
state_ids = {
    'Alabama': 1, 'Alaska': 2, 'Arizona': 4, 'Arkansas': 5, 'California': 6,
    'Colorado': 8, 'Connecticut': 9, 'Delaware': 10, 'District of Columbia': 11,
    'Florida': 12, 'Georgia': 13,
    'Hawaii': 15, 'Idaho': 16, 'Illinois': 17, 'Indiana': 18, 'Iowa': 19,
    'Kansas': 20, 'Kentucky': 21, 'Louisiana': 22, 'Maine': 23, 'Maryland': 24,
    'Massachusetts': 25, 'Michigan': 26, 'Minnesota': 27, 'Mississippi': 28,
    'Missouri': 29, 'Montana': 30, 'Nebraska': 31, 'Nevada': 32, 'New Hampshire': 33,
    'New Jersey': 34, 'New Mexico': 35, 'New York': 36, 'North Carolina': 37,
    'North Dakota': 38, 'Ohio': 39, 'Oklahoma': 40, 'Oregon': 41, 'Pennsylvania': 42,
    'Rhode Island': 44, 'South Carolina': 45, 'South Dakota': 46, 'Tennessee': 47,
    'Texas': 48, 'Utah': 49, 'Vermont': 50, 'Virginia': 51, 'Washington': 53,
    'West Virginia': 54, 'Wisconsin': 55, 'Wyoming': 56
}

# Add FIPS ID to data
us_data['id'] = us_data['region'].map(state_ids)

# Regions without a FIPS code cannot be placed on the map. Report them before
# they are removed so the exclusion is visible rather than silent.
unmapped_regions = us_data.loc[us_data['id'].isna(), 'region'].tolist()
if unmapped_regions:
    print(f"Warning: no FIPS code for {len(unmapped_regions)} US region(s), "
          f"excluded from the US data: {unmapped_regions}")

us_data = us_data.dropna(subset=['id'])
us_data['id'] = us_data['id'].astype(int)

print("US data with state IDs:")
print(us_data[['region', 'id', 'mean_af', 'sample_count']].to_string(index=False))

In [None]:
# Create US map
# Settings shared by both layers so they line up exactly
us_projection = dict(type='albersUsa')
us_frame = dict(width=600, height=400)

# Base layer: every state shape filled light grey, independent of us_data
us_background = (
    alt.Chart(states)
    .mark_geoshape(fill='lightgray', stroke='white')
    .project(**us_projection)
    .properties(**us_frame)
)

# Data layer: the per-state statistics are joined onto the shapes via `id`
us_lookup = alt.LookupData(us_data, 'id', ['mean_af', 'region', 'sample_count', 'variant_count'])
us_color = alt.Color(
    'mean_af:Q',
    scale=alt.Scale(scheme='blues'),
    legend=alt.Legend(title='Mean AF'),
)
us_tooltip = [
    alt.Tooltip('region:N', title='State'),
    alt.Tooltip('mean_af:Q', title='Mean AF', format='.4f'),
    alt.Tooltip('sample_count:Q', title='Samples'),
    alt.Tooltip('variant_count:Q', title='Variant Calls'),
]

us_choropleth = (
    alt.Chart(states)
    .mark_geoshape(stroke='white')
    .encode(color=us_color, tooltip=us_tooltip)
    .transform_lookup(lookup='id', from_=us_lookup)
    .project(**us_projection)
    .properties(title='V/C/P Gene Variant Mean AF by US State', **us_frame)
)

# Layer the coloured states on top of the grey base
us_map = us_background + us_choropleth
us_map

In [None]:
# === Canada Province Map ===

# GeoJSON file with Canadian province outlines.
# NOTE(review): this URL is not referenced by any other cell visible in this
# notebook; the provinces are shown as a bar chart rather than a map.
canada_url = 'https://raw.githubusercontent.com/codeforamerica/click_that_hood/master/public/data/canada.geojson'

# Preview the per-province numbers that feed the chart
canada_preview = canada_data[['region', 'mean_af', 'sample_count']]
print("Canada data:")
print(canada_preview.to_string(index=False))

In [None]:
# Create Canada bar chart (simpler than choropleth for few provinces)
# Bars are ordered by descending mean AF and shaded green by the same value.
canada_x = alt.X('region:N', title='Province', sort='-y')
canada_y = alt.Y('mean_af:Q', title='Mean Allele Frequency')
canada_color = alt.Color(
    'mean_af:Q',
    scale=alt.Scale(scheme='greens'),
    legend=None,
)
canada_tooltip = [
    alt.Tooltip('region:N', title='Province'),
    alt.Tooltip('mean_af:Q', title='Mean AF', format='.4f'),
    alt.Tooltip('sample_count:Q', title='Samples'),
    alt.Tooltip('variant_count:Q', title='Variant Calls'),
]

canada_chart = (
    alt.Chart(canada_data)
    .mark_bar()
    .encode(x=canada_x, y=canada_y, color=canada_color, tooltip=canada_tooltip)
    .properties(
        width=400,
        height=300,
        title='V/C/P Gene Variant Mean AF by Canadian Province',
    )
)

canada_chart

In [None]:
# === Romania Visualization ===

romania_preview = romania_data[['location', 'mean_af', 'sample_count', 'variant_count']]
print("Romania data:")
print(romania_preview.to_string(index=False))

# Romania has only Bucharest, show as single indicator
romania_tooltip = [
    alt.Tooltip('location:N', title='Location'),
    alt.Tooltip('mean_af:Q', title='Mean AF', format='.4f'),
    alt.Tooltip('sample_count:Q', title='Samples'),
    alt.Tooltip('variant_count:Q', title='Variant Calls'),
]

romania_chart = (
    alt.Chart(romania_data)
    .mark_bar()
    .encode(
        x=alt.X('location:N', title='Location'),
        y=alt.Y('mean_af:Q', title='Mean Allele Frequency'),
        color=alt.value('coral'),  # fixed bar colour, not data-driven
        tooltip=romania_tooltip,
    )
    .properties(
        width=200,
        height=300,
        title='V/C/P Gene Variant Mean AF - Romania',
    )
)

romania_chart

In [None]:
# === Combined Summary ===

# Stack the three per-country tables and give each row a "country: region"
# label for the x axis
all_regions = pd.concat([us_data, canada_data, romania_data]).assign(
    label=lambda frame: frame['country'] + ': ' + frame['region']
)

# Fixed country -> colour assignment for the legend
country_color = alt.Color(
    'country:N',
    scale=alt.Scale(
        domain=['USA', 'Canada', 'Romania'],
        range=['steelblue', 'seagreen', 'coral'],
    ),
    legend=alt.Legend(title='Country'),
)
combined_tooltip = [
    alt.Tooltip('label:N', title='Location'),
    alt.Tooltip('mean_af:Q', title='Mean AF', format='.4f'),
    alt.Tooltip('sample_count:Q', title='Samples'),
    alt.Tooltip('variant_count:Q', title='Variant Calls'),
]

# Combined comparison bar chart, bars sorted by descending mean AF
combined_chart = (
    alt.Chart(all_regions)
    .mark_bar()
    .encode(
        x=alt.X('label:N', title='Location', sort='-y'),
        y=alt.Y('mean_af:Q', title='Mean Allele Frequency'),
        color=country_color,
        tooltip=combined_tooltip,
    )
    .properties(
        width=700,
        height=400,
        title='V/C/P Gene Variant Mean AF by Region (All Locations)',
    )
)

combined_chart

In [None]:
# === Summary Statistics ===

banner = "=" * 60
print(banner)
print("V/C/P GENE VARIANT ANALYSIS SUMMARY")
print(banner)

# Headline counts taken from the tables built in earlier cells
print(f"\nTotal V/C/P variants analyzed: {len(vcp_variants)}")
print(f"Total per-sample variant calls: {len(per_sample_vcp)}")
print(f"Unique samples: {merged['Sample'].nunique()}")
print(f"Unique locations: {len(location_stats)}")

# Per-country breakdown; countries with no rows in all_regions are skipped
print("\n--- By Country ---")
for country in ('USA', 'Canada', 'Romania'):
    in_country = all_regions['country'].str.contains(country, na=False)
    subset = all_regions[in_country]
    if subset.empty:
        continue
    af_low = subset['mean_af'].min()
    af_high = subset['mean_af'].max()
    print(f"\n{country}:")
    print(f"  Regions: {len(subset)}")
    print(f"  Total samples: {subset['sample_count'].sum()}")
    print(f"  Mean AF range: {af_low:.4f} - {af_high:.4f}")

# Statistics over every merged V/C/P call, regardless of location
all_af = merged['AF']
print("\n--- Overall ---")
print(f"Overall mean AF: {all_af.mean():.4f}")
print(f"AF std dev: {all_af.std():.4f}")

In [None]:
# Save summary to Galaxy
location_stats.to_csv('vcp_variant_by_location.tsv', sep='\t', index=False)
await gxy.put('vcp_variant_by_location.tsv', output='V/C/P Variants by Location', ext='tabular')
print("Summary saved to Galaxy history!")