In [1]:
import pandas as pd
import numpy as np
import re
# import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
SAMPLE_INDEX = 9

In [3]:
def get_headerless_vcf_df(f_obj, stop_on='#CHROM'):
    line = ''
    header = ''
    while True:
        line = f_obj.readline()
        header += line
        if line[:6] == stop_on:
            break
            
#     print(line)
            
    df = pd.read_csv(f_obj, sep='\t', header=None)
    df.columns = line[1:].replace('""', '').replace('\n', '').split('\t')

    return df, header

In [4]:
def get_calls_matrix(vcf_fp):
    f_obj = open(vcf_fp)
    
    line = ''
    while True:
        line = f_obj.readline()
        if line[:6] == '#CHROM':
            break
    
    line = line.replace('\n', '')
    samples = line.split('\t')[SAMPLE_INDEX:]
    
    X = []
    for line in f_obj:
        line = line.replace('\n', '')
        pieces = line.split('\t')
        X.append(pieces[SAMPLE_INDEX:])
    
    return np.asarray(X), samples

### assumptions

sample vcf is same length as genomes vcf

In [5]:
CALLED_VCF_FP = '/diskmnt/Projects/Users/estorrs/data/ancestry/MM/temp/called_samples.vcf'
GENOMES_VCF_FP = '/diskmnt/Projects/Users/estorrs/1000-genomes/GRCh37/all.coding.sorted.02maf.10000sampled.sorted.snps.vcf'

In [53]:
def preprocess_vcf(vcf_fp):
    """Pull out X, convert to 
    Returns:
        (X, scaler)
    """
    calls_matrix, samples = get_calls_matrix(vcf_fp)
    
    for i, row in enumerate(calls_matrix):
        row = [re.sub(re.compile(r'^0\|[1-9]+|^[1-9]+\|0'), r'0|1', v) for v in row]
        calls_matrix[i] = [re.sub(re.compile(r'^[1-9]+\|[1-9]+'), r'1|1', v) for v in row]
        
    # encode calls
    label_encoder = LabelEncoder()
    label_encoder.fit(['.|.', '0|0', '0|1', '1|1', '0', '1'])
    
    for i, row in enumerate(calls_matrix):
        calls_matrix[i] = label_encoder.transform(row)
    
#     print(ca)
    
    calls_matrix = np.transpose(calls_matrix)
    
    return calls_matrix, samples

def get_preprocessed_X(vcf_fp):
    df, _ = get_headerless_vcf_df(open(vcf_fp))
    
    # remove x
    df = df[df['CHROM'] != 'X']
    df = df[df['CHROM'] != 'chrX']


    
    trimmed_df = df[df.columns[9:]]
    samples = trimmed_df.columns
    
    trimmed_df = trimmed_df.replace(re.compile(r'^0\|[1-9]+|^[1-9]+\|0'), '0|1')
    trimmed_df = trimmed_df.replace(re.compile(r'^[1-9]+\|[1-9]+'), '1|1')
    
#     return trimmed_df
    # for X chrom
#     trimmed_df = trimmed_df.replace(re.compile(r'^[1-9]+$'), '1')
    
    # encode genotype
    label_encoder = LabelEncoder()
#     label_encoder.fit(['.|.', '0|0', '0|1', '1|1', '0', '1'])
    label_encoder.fit(['.|.', '0|0', '0|1', '1|1'])

    # encode the rows
    for i, row in trimmed_df.iterrows():
        trimmed_df.loc[i][:] = label_encoder.transform(row.values)
        
    X = trimmed_df.values

    # transpose so each row is now a sample
    X = X.transpose()
    
    return X, label_encoder, samples, trimmed_df, df
    

In [54]:
X, encoder, samples, trimmed_df, df = get_preprocessed_X(GENOMES_VCF_FP)

In [55]:
X.shape, len(samples)

((2504, 9334), 2504)

In [56]:
X

array([[1, 1, 1, ..., 2, 1, 1],
       [2, 3, 1, ..., 2, 1, 1],
       [2, 2, 1, ..., 2, 1, 3],
       ...,
       [1, 2, 1, ..., 3, 1, 1],
       [1, 1, 1, ..., 2, 1, 1],
       [1, 1, 1, ..., 3, 2, 1]], dtype=object)

In [57]:
df['CHROM']

0       17
2        6
3        1
4       19
5       17
6       15
7       14
8       18
9       17
10       3
11      19
12       3
13      15
14       1
15      11
16       8
17       2
18       5
19      12
20      10
21      10
22      16
23      19
24       4
25       1
26      15
27       9
28      11
29       5
30      11
        ..
9500    19
9501     5
9502     1
9503    22
9504     1
9505     4
9506     3
9508    12
9509     9
9510     2
9511     9
9512    14
9513     1
9514     4
9515    14
9516    19
9517    14
9518    11
9519    14
9520     1
9521    11
9522    14
9523     4
9524     8
9525    20
9526    10
9527    17
9528    19
9529    22
9530     4
Name: CHROM, Length: 9334, dtype: object