In [None]:
!pip install scikit-learn

In [None]:
import os
import time
import dxpy
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# Load reference and UKB PCA data

In [None]:
# Read the two gzip-compressed CSV tables of PCA coordinates.
# ref_df: reference samples; later cells use its `ancestry_pred` and `pca_<i>` columns for training.
ref_df = pd.read_csv("/mnt/project/notebooks/ancestry_inference/data/ref_pca.csv.gz")
# geno_df: samples to classify; later cells use its `s` column (sample id) and `pca_<i>` columns.
geno_df = pd.read_csv("/mnt/project/notebooks/ancestry_inference/data/geno_pca.csv.gz")

In [None]:
def get_training_data(ref_df, npca=16):
    """Build the classifier training matrix and labels from the reference table.

    Rows whose ``ancestry_pred`` is ``"oth"`` are dropped, and the ``fin`` and
    ``nfe`` groups are merged into a single ``"eur"`` label; every other
    ancestry code is kept as its own label.

    Parameters
    ----------
    ref_df : pandas.DataFrame
        Reference samples with an ``ancestry_pred`` column and feature columns
        named ``pca_1`` ... ``pca_<npca>``. The frame is not modified.
    npca : int, default 16
        Number of leading PCA columns to use as features.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train, y_train)``: the feature matrix with ``npca`` columns and
        the matching array of string labels.

    Raises
    ------
    ValueError
        If a remaining ``ancestry_pred`` value has no entry in the label
        mapping (it would otherwise silently become a NaN label).
    KeyError
        If one of the requested ``pca_<i>`` columns is missing.
    """
    label_mappings = {
        "afr": "afr", "amr": "amr", "eas": "eas", "mid": "mid",
        "fin": "eur", "nfe": "eur", "sas": "sas",
    }

    # Filter first and copy the result, so the new column is added to an
    # independent frame (avoids pandas' SettingWithCopyWarning).
    train_df = ref_df.loc[ref_df.ancestry_pred != "oth"].copy()
    train_df["label"] = train_df.ancestry_pred.map(label_mappings)

    # Fail loudly on ancestry codes the mapping does not know about.
    unmapped = train_df.loc[train_df.label.isna(), "ancestry_pred"].unique()
    if len(unmapped) > 0:
        raise ValueError(f"No label mapping for ancestry_pred values: {list(unmapped)}")

    pca_columns = [f"pca_{i}" for i in range(1, npca + 1)]
    return train_df[pca_columns].values, train_df.label.values


In [None]:
# Training features and labels from the reference table, using get_training_data's default npca=16.
X_train, y_train = get_training_data(ref_df)

In [None]:
# Select exactly as many leading PCs as the training matrix has columns, so the
# features used for prediction always match the features the model is trained on
# (instead of a second hard-coded 16 that must be kept in sync by hand).
X_test = geno_df.loc[:, [f"pca_{i}" for i in range(1, X_train.shape[1] + 1)]].values

# Train random forest classifier

In [None]:
# Model pipeline: standardize the PCA features, then fit a random forest
# (random_state is fixed so repeated runs build the same forest).
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Hyperparameter grid for the search below: number of trees and maximum tree
# depth of the random forest. The `classifier__` prefix targets the pipeline
# step named 'classifier'.
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 10]
}


In [None]:
# Cross-validated hyperparameter search over param_grid: 5 folds, ranked by accuracy.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=0,
)
grid_search.fit(X_train, y_train)

# Report the winning parameter combination and its mean cross-validated accuracy.
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Score: {grid_search.best_score_:.4f}")


In [None]:
# Keep the pipeline that the grid search selected as best; it is used for all predictions below.
best_model = grid_search.best_estimator_

In [None]:
# Show the class labels of the fitted classifier; they become the probability column names below.
best_model['classifier'].classes_

# Predict the probabilities for the 6 ancestries

In [None]:
# Per-class probabilities for every row of X_test (one column per entry of classes_).
predicted_prob = best_model.predict_proba(X_test)

In [None]:
# Label the probability matrix: rows by sample id (geno_df.s, same row order as X_test), columns by class label.
predicted_prob_df = pd.DataFrame(data=predicted_prob, index=geno_df.s, columns=best_model['classifier'].classes_)

# Assign ancestries based on predicted prob

In [None]:
def get_ancestry(ser, threshold=0.75, fallback="oth"):
    """Return the label whose probability exceeds ``threshold``, else ``fallback``.

    Parameters
    ----------
    ser : pandas.Series
        Probabilities indexed by ancestry label (e.g. one row of the
        predicted-probability table).
    threshold : float, default 0.75
        A label is assigned only when its probability is strictly greater
        than this value.
    fallback : str, default "oth"
        Value returned when no probability exceeds ``threshold``.

    Returns
    -------
    The index label of the single value above ``threshold``, or ``fallback``.

    Raises
    ------
    AssertionError
        If more than one value exceeds ``threshold`` (e.g. when ``threshold``
        is below 0.5, or the values are not probabilities summing to 1).
    """
    confident = ser.loc[ser > threshold].index
    if len(confident) == 0:
        return fallback
    # The call must be unambiguous: at most one label may clear the threshold.
    assert len(confident) == 1, f"multiple labels exceed {threshold}: {list(confident)}"
    return confident[0]
    

In [None]:
# Apply get_ancestry to the class-probability columns only. Applying it to the whole
# frame breaks when this cell is re-run, because the rows then also contain the
# string-valued "ancestry_pred" column, which cannot be compared with the threshold.
predicted_prob_df["ancestry_pred"] = predicted_prob_df[list(best_model['classifier'].classes_)].apply(get_ancestry, axis=1)

# Check consistency with self-reported ethnicity

In [None]:
# Load the self-reported ethnic background of each sample; only the two columns needed here are read.
pheno_df = pd.read_csv("/mnt/project/notebooks/regenie/data/pheno.csv.gz", usecols=["sample_names", "ethnic_background"])

In [None]:
# Attach the predicted probabilities and ancestry call to each phenotype row.
# Only the two original phenotype columns are kept on the left side (in their
# existing order), so re-running this cell does not merge the prediction columns
# in a second time and create duplicated "_x"/"_y"-suffixed columns.
pheno_df = pheno_df.loc[:, pheno_df.columns.isin(["sample_names", "ethnic_background"])].merge(
    predicted_prob_df, left_on="sample_names", right_index=True
)

In [None]:
# Number of rows left after the merge (sanity check on how many samples matched).
len(pheno_df)

In [None]:
# Cross-tabulate self-reported ethnic background (rows) against predicted
# ancestry (columns): each cell is a sample count, and combinations that never
# occur are filled with 0.
pivot_table = pd.pivot_table(pheno_df, 
                             index='ethnic_background', 
                             columns='ancestry_pred', 
                             aggfunc='size', 
                             fill_value=0)

In [None]:
# Display the ethnic-background by predicted-ancestry count table.
pivot_table

In [None]:
# Expected ancestry label for each self-reported ethnic background that has an
# unambiguous expectation, written grouped by ancestry. Sorting by background
# name keeps the entries in alphabetical order.
most_likely_ancestry_dict = dict(sorted({
    **dict.fromkeys(("African",), "afr"),
    **dict.fromkeys(("Chinese",), "eas"),
    **dict.fromkeys(("British", "Irish", "White"), "eur"),
    **dict.fromkeys(("Bangladeshi", "Indian", "Pakistani"), "sas"),
}.items()))

In [None]:
# Work on a copy so pheno_df (exported later) is not affected by the filtering and extra column below.
consistency_df = pheno_df.copy()

In [None]:
# Keep only the samples whose self-reported background has an expected ancestry defined.
consistency_df = consistency_df[
    consistency_df["ethnic_background"].isin(set(most_likely_ancestry_dict))
]

In [None]:
# Add the expected ancestry via assign(), which returns a new frame. At this
# point consistency_df is a filtered selection, so assigning a column in place
# can trigger pandas' SettingWithCopyWarning.
consistency_df = consistency_df.assign(
    most_likely_ancestry=consistency_df.ethnic_background.map(most_likely_ancestry_dict)
)

In [None]:
# Fraction of the retained samples whose predicted ancestry equals the ancestry expected from their self-reported background.
accuracy_score(consistency_df.most_likely_ancestry, consistency_df.ancestry_pred)

In [None]:
# Per-class metrics, returned as a dict (output_dict=True) so they can be tabulated.
# NOTE(review): ancestry_pred can hold labels that never occur in most_likely_ancestry
# (e.g. "oth"), so the report presumably includes rows for those labels too; confirm.
class_report = classification_report(consistency_df.most_likely_ancestry, consistency_df.ancestry_pred, output_dict=True)

In [None]:
# Tabulate the report dict. NOTE: "resport" is a typo for "report"; the name is kept
# because later cells reference it.
class_resport_df = pd.DataFrame(class_report)

In [None]:
def upload_file_to_project(filename, proj_dir):
    """Upload a local file through dxpy and print a confirmation line.

    Parameters
    ----------
    filename : str
        Path of the local file to upload; also used in the printed message.
    proj_dir : str
        Destination folder passed to dxpy as ``folder``. ``parents=True`` is
        passed as well (presumably so missing parent folders are created;
        confirm against the dxpy documentation).

    Returns
    -------
    None
    """
    dxpy.upload_local_file(filename=filename, folder=proj_dir, parents=True)
    print(f"*********{filename} uploaded!!*********")

In [None]:
# Display the classification report table.
class_resport_df

In [None]:
# Save the classification report as CSV locally, then upload it to the project folder.
proj_dir = "/notebooks/ancestry_inference/data/"
filename = "consistency_report.csv"
class_resport_df.to_csv(filename)
upload_file_to_project(filename, proj_dir)

In [None]:
# Save the ethnic-background by predicted-ancestry count table as CSV locally, then upload it.
proj_dir = "/notebooks/ancestry_inference/data/"
filename = "final_run_ancestry_pivot.csv"
pivot_table.to_csv(filename)
upload_file_to_project(filename, proj_dir)

In [None]:
# Save the per-sample table (ethnic background, class probabilities, ancestry call)
# without the row index, then upload it to the project folder.
proj_dir = "/notebooks/ancestry_inference/data/"
filename = "ancestry_pred.csv.gz"
pheno_df.to_csv(filename, index=False)
upload_file_to_project(filename, proj_dir)