In [2]:
import re

from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
def get_vcf_df(vcf_fp, stop_on='#CHROM'):
    """Load the per-sample columns of a VCF file as a samples-by-variants frame.

    Lines are skipped until one that starts with ``stop_on`` is found. That
    line supplies the column names; everything after it is parsed as
    tab-separated data.

    Args:
        vcf_fp: Path to the VCF file.
        stop_on: Prefix that identifies the column-header line.

    Returns:
        A ``(df, sample_ids)`` tuple. ``df`` has one row per sample column and
        one column per value of the ``ID`` field, with rows whose ``CHROM`` is
        ``'X'`` removed and values beginning with ``1|0`` rewritten to begin
        with ``0|1``. ``sample_ids`` is the list of row labels of ``df``.

    Raises:
        ValueError: If no line starting with ``stop_on`` is found.
    """
    # The with-block closes the handle even if parsing below raises.
    with open(vcf_fp) as f:
        # iter(readline, '') stops at end of file, so a file without a header
        # line ends the loop instead of spinning forever on empty reads.
        for line in iter(f.readline, ''):
            if line.startswith(stop_on):
                break
        else:
            raise ValueError(
                'no header line starting with %r found in %s' % (stop_on, vcf_fp)
            )

        # The handle now sits on the first data row.
        df = pd.read_csv(f, sep='\t', header=None)

    # Column names come from the header line, minus its leading '#'.
    df.columns = line.lstrip('#').strip().replace('""', '').split('\t')
    df.index = df['ID']
    df.index.name = ''

    # remove X chromosome
    df = df[df['CHROM'] != 'X']

    # Keep everything after the first nine columns (in a standard VCF these
    # nine are the fixed fields CHROM..FORMAT, leaving the sample columns).
    df = df[df.columns[9:]]

    # transpose dataframe so samples are rows, mutations are columns
    df = df.transpose()

    # replace phased calls: a leading "1|0" becomes "0|1"
    df = df.replace(re.compile(r'^1\|0'), '0|1')

    sample_ids = list(df.index)

    return df, sample_ids

In [None]:
# Lookup from ancestry abbreviation to the full label used for it.
abbr_to_full = dict(
    EUR='european',
    AFR='african',
    SAS='south asian',
    EAS='east asian',
    AMR='south/north american',
)

def get_ancestry_map(map_fp):
    """Read a tab-separated sample/ancestry file into a dictionary.

    The first line is treated as a header and discarded. Every following
    non-blank line must hold exactly two tab-separated fields: a sample id
    and an ancestry abbreviation, which is translated through the
    module-level ``abbr_to_full`` mapping.

    Args:
        map_fp: Path to the mapping file.

    Returns:
        dict mapping each sample id to its full ancestry name.

    Raises:
        KeyError: If an ancestry abbreviation is not a key of ``abbr_to_full``.
        ValueError: If a non-blank line does not split into exactly two
            tab-separated fields.
    """
    ancestry_map = {}

    # The with-block closes the handle on both success and failure.
    with open(map_fp) as f:
        # dump header
        f.readline()

        for line in f:
            line = line.strip()
            if not line:
                # Skip blank lines (e.g. a trailing newline at end of file)
                # instead of failing on the two-field unpack below.
                continue

            sample_id, ancestry = line.split('\t')

            # replace ancestry with full name
            ancestry_map[sample_id] = abbr_to_full[ancestry]

    return ancestry_map