In [5]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas_plink import read_plink1_bin

In [6]:
# Notebook-wide plotting defaults.
# The style is selected first, before any individual settings are applied on top of it.
plt.style.use('default')
sns.set_palette("husl")

# Figure size and base font size used for every plot below.
plt.rcParams.update({
    'figure.figsize': (10, 6),
    'font.size': 12,
})

In [7]:
# ===================================
# Cell 1: Load Data (CORRECT METHOD)
# ===================================

# Prefix of the PLINK fileset, i.e. the path WITHOUT the .bed extension.
# The default is the location this notebook was originally run from. To run it
# against a copy stored elsewhere, set the GOAT_DATA_PREFIX environment variable
# instead of editing this cell. An unset or empty variable falls back to the default.
DEFAULT_DATA_PREFIX = "/home/vcm/BIOSTATS915_HighDimensionalStatisticsAndMachineLearning/HW1/hw-optionB-files/ADAPTmap_genotypeTOP_20160222_full"
data_prefix = os.environ.get("GOAT_DATA_PREFIX") or DEFAULT_DATA_PREFIX

# read_plink1_bin is given the .bed path explicitly, so the extension is appended here.
bed_file = data_prefix + '.bed'

# Load the PLINK data. Only the .bed path is passed; the companion .bim/.fam
# paths are not given (per the pandas-plink docs they are inferred from the
# .bed path - confirm if the files are ever renamed).
try:
    print("Loading PLINK data...")
    print(f"BED file: {bed_file}")
    goat_data = read_plink1_bin(bed_file, verbose=False)
    print("✓ Data loaded successfully!")
    print(f"Data type: {type(goat_data)}")
except Exception as e:
    # Print a short message next to the path shown above, then re-raise so the
    # full traceback is shown and later cells do not run without goat_data.
    print(f"Failed to load data: {e}")
    raise

Loading PLINK data...
BED file: /home/vcm/BIOSTATS915_HighDimensionalStatisticsAndMachineLearning/HW1/hw-optionB-files/ADAPTmap_genotypeTOP_20160222_full.bed
✓ Data loaded successfully!
Data type: <class 'xarray.core.dataarray.DataArray'>


In [12]:
# ===================================
# Cell 2: Data Overview - Correct xarray access
# ===================================

# Sizes are read from the DataArray's coordinates and metadata only.
# `goat_data.values.shape` would first convert the whole genotype matrix to a
# NumPy array (for the lazily loaded array pandas-plink returns, that means
# reading every genotype) just to report its shape; `goat_data.shape` yields
# the same tuple without touching the data.
n_individuals = len(goat_data.sample)
n_snps = len(goat_data.variant)

# Number of distinct family IDs (the `fid` coordinate on the sample axis).
# NOTE(review): this assumes FID encodes the population in this dataset - confirm.
n_populations = len(np.unique(goat_data.sample.fid.values))

print("\n=== DATASET OVERVIEW ===")
print(f"Number of individuals: {n_individuals:,}")
print(f"Number of SNPs: {n_snps:,}")
print(f"Genotype matrix shape: {goat_data.shape} (Individuals x SNPs)")

print("\n=== ASSIGNMENT QUESTIONS ===")
print("Q: How many subjects/individuals are in this goat dataset?")
print(f"A: {n_individuals:,} individuals")

print("\nQ: How many different populations of goats are in this dataset?")
print(f"A: {n_populations} different populations")


=== DATASET OVERVIEW ===
Number of individuals: 4,653
Number of SNPs: 53,347
Genotype matrix shape: (4653, 53347) (Individuals x SNPs)

=== ASSIGNMENT QUESTIONS ===
Q: How many subjects/individuals are in this goat dataset?
A: 4,653 individuals

Q: How many different populations of goats are in this dataset?
A: 144 different populations
