In [None]:
from pyspark.sql import SparkSession
import hail as hl
import os
import time
import dxpy
import logging
import pandas as pd
import re

# The explicit memory/core settings below were needed to get past an RDD
# partition error.
# NOTE(review): getOrCreate() returns an already-running session if one exists,
# in which case these values may not be applied; the next cell prints the
# effective configuration so this can be checked.
builder = SparkSession.builder.appName("HailApplication")
builder = builder.config("spark.driver.memory", "12g")  # driver memory
builder = builder.config("spark.executor.memory", "50g")  # memory per executor
builder = builder.config("spark.executor.cores", "6")  # cores per executor
builder = builder.enableHiveSupport()
spark = builder.getOrCreate()

# Give Hail the session's SparkContext; idempotent=True lets this cell be re-run.
hl.init(sc=spark.sparkContext, idempotent=True)

In [None]:
# Read the Spark configuration back through Hail's public accessor
# (hl.spark_context()) instead of the private `_jbackend` handle, which is an
# implementation detail and can change between Hail releases.
spark_conf = hl.spark_context().getConf()

# Look up the settings requested when the session was built; "Not set" is the
# fallback shown when a key is absent from the configuration.
executor_memory = spark_conf.get("spark.executor.memory", "Not set")
driver_memory = spark_conf.get("spark.driver.memory", "Not set")
worker_cores = spark_conf.get("spark.executor.cores", "Not set")

print(f"Executor Memory: {executor_memory}")
print(f"Driver Memory: {driver_memory}")
print(f"Worker Cores: {worker_cores}")

# Checkpoint 4

In [None]:
# Checkpoint target: the overlap-filtered genotype MatrixTable.
# Ensure the DNAX-backed database exists before looking up its ID.
db_name = "ancestry_inference"
stmt = "CREATE DATABASE IF NOT EXISTS {} LOCATION 'dnax://'".format(db_name)
print(stmt)
spark.sql(stmt).show()

# The lookup uses the lower-cased database name; the 'id' of the match becomes
# part of the dnax:// URL that later cells write to / read from.
db_uri = dxpy.find_one_data_object(
    name=db_name.lower(),
    classname="database",
)["id"]
mt_name = "geno_overlap_filtered.mt"
url = "dnax://{}/{}".format(db_uri, mt_name)


In [None]:
# Flip RERUN to True to (re)write the genotype checkpoint; left False so that a
# normal run only reads the existing copy in the next cell.
RERUN = False
if RERUN:
    geno_mt.write(url, overwrite=True)
    

In [None]:
# Load the genotype MatrixTable from the checkpoint URL built above.
geno_mt = hl.read_matrix_table(url)

In [None]:
# Sanity check: display the dimensions of the reloaded MatrixTable.
geno_mt.count()

# Save PCA projections for geno

In [None]:
# Checkpoint target: the PCA loadings table.
# Ensure the DNAX-backed database exists before looking up its ID.
db_name = "ancestry_inference"
stmt = "CREATE DATABASE IF NOT EXISTS {} LOCATION 'dnax://'".format(db_name)
print(stmt)
spark.sql(stmt).show()

# NOTE: `url` is rebound here, so from this point on it refers to the loadings
# table rather than the genotype MatrixTable.
db_uri = dxpy.find_one_data_object(
    name=db_name.lower(),
    classname="database",
)["id"]
mt_name = "pca_loadings.ht"
url = "dnax://{}/{}".format(db_uri, mt_name)


In [None]:
# Flip RERUN to True to (re)write the loadings checkpoint; left False so that a
# normal run only reads the existing copy in the next cell.
RERUN = False
if RERUN:
    loadings_ht.write(url, overwrite=True)


In [None]:
# Load the PCA loadings table from the checkpoint URL built above.
loadings_ht = hl.read_table(url)

In [None]:
# Show the table schema; the projection below reads its `loadings` and `af` fields.
loadings_ht.describe()

In [None]:
# Project the genotype calls onto the previously computed PCA loadings.
# The resulting table carries a per-sample `scores` array, which is expanded
# into one column per component further down.
ht = hl.experimental.pc_project(
    geno_mt.GT,            # genotype calls to project
    loadings_ht.loadings,  # loadings read from the checkpointed table
    loadings_ht.af,        # allele frequencies stored alongside the loadings
)

In [None]:
# Checkpoint target: the projected per-sample PCA scores table.
# Ensure the DNAX-backed database exists before looking up its ID.
db_name = "ancestry_inference"
stmt = "CREATE DATABASE IF NOT EXISTS {} LOCATION 'dnax://'".format(db_name)
print(stmt)
spark.sql(stmt).show()

# NOTE: `url` is rebound here, so from this point on it refers to the
# sample-scores table rather than the loadings table.
db_uri = dxpy.find_one_data_object(
    name=db_name.lower(),
    classname="database",
)["id"]
mt_name = "geno_sample_pca.ht"
url = "dnax://{}/{}".format(db_uri, mt_name)


In [None]:
# RERUN is enabled here, so the projected scores table is written (overwriting
# any existing copy) every time this cell runs. Set it to False to reuse the
# stored checkpoint instead.
RERUN = True
if RERUN:
    ht.write(url, overwrite=True)


In [None]:
# Rebind `ht` to the table read back from the checkpoint URL, so later cells
# work from the stored copy.
ht = hl.read_table(url)

In [None]:
# Show the schema of the projected table (a `scores` field is used below).
ht.describe()

In [None]:
# Convert the projected scores table into a local pandas DataFrame.
geno_pca_df = ht.to_pandas()

In [None]:
# Expand the per-sample `scores` list into one column per principal component
# (pca_1, pca_2, ...). The number of components is taken from the data instead
# of being hard-coded to 20, so the cell keeps working if the loadings were
# computed with a different number of PCs.
pca_scores_wide = pd.DataFrame(geno_pca_df["scores"].tolist(), index=geno_pca_df.index)
pca_columns = [f"pca_{i}" for i in range(1, pca_scores_wide.shape[1] + 1)]
# Columns are matched by position: the i-th score becomes column pca_{i}.
geno_pca_df[pca_columns] = pca_scores_wide


In [None]:
# The list-valued `scores` column is redundant once expanded into pca_* columns.
geno_pca_df = geno_pca_df.drop("scores", axis="columns")


In [None]:
def upload_file_to_project(filename, proj_dir):
    """Upload a local file to a DNAnexus folder and print a confirmation.

    Args:
        filename: Path of the local file to upload.
        proj_dir: Destination folder passed to dxpy as ``folder``.
            ``parents=True`` is passed alongside it.

    Returns:
        None. No project is given explicitly, so dxpy's defaults decide which
        project receives the file.
    """
    dxpy.upload_local_file(filename, parents=True, folder=proj_dir)
    print("*********{} uploaded!!*********".format(filename))

In [None]:
# Write the PCA scores to a local file and upload it to the project folder.
# The ".gz" suffix presumably makes pandas gzip the output via its default
# compression inference — confirm against the installed pandas version.
proj_dir = "/notebooks/ancestry_inference/data/"
filename = "geno_pca.csv.gz"
geno_pca_df.to_csv(path_or_buf=filename, index=False)
upload_file_to_project(filename, proj_dir)



In [None]:
# Preview the first rows of the exported frame.
geno_pca_df.head()