In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import decomposition
import altair as alt

Read in copy number variant (CNV) and population information data

In [6]:
# read in cnv data
df = pd.read_csv('hm3_cnv_submission.txt', sep = '\t')

df.head()

Unnamed: 0,cnp_id,chr,start,end,NA06984,NA06985,NA06986,NA06989,NA06991,NA06993,...,NA21733,NA21738,NA21739,NA21740,NA21741,NA21768,NA21776,NA21784,NA21825,NA21826
0,HM3_CNP_1,1,8105049,8112441,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
1,HM3_CNP_2,1,10292133,10300570,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
2,HM3_CNP_3,1,10466423,10467633,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
3,HM3_CNP_4,1,12764515,12894420,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,1.0,2.0,2.0,2.0,1.0,2.0,2.0
4,HM3_CNP_5,1,13647613,13649415,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0


In [7]:
# read in sample information into lists

YRI_file = open('YRI_samples.txt', 'r')
JPT_file = open('JPT_samples.txt', 'r')
CEU_file = open('CEU_samples.txt', 'r')
CHB_file = open('CHB_samples.txt', 'r')

YRI_data = YRI_file.read()
YRI = YRI_data.split('\n')

JPT_data = JPT_file.read()
JPT = JPT_data.split('\n')

CEU_data = CEU_file.read()
CEU = CEU_data.split('\n')

CHB_data = CHB_file.read()
CHB = CHB_data.split('\n')

YRI_file.close()
JPT_file.close()
CEU_file.close()
CHB_file.close()

In [12]:
# remove empty string at end of line
YRI.remove('')
JPT.remove('')
CEU.remove('')
CHB.remove('')

Prepare data for PCA

In [24]:
# set CNVs as index
df = df.set_index('cnp_id')

# remove unnecessary columns
df = df.iloc[:, 4:]

# impute NA
## will just fill the NA with 2.0, which is assuming uncalled CNVs have the normal 2 copies
df = df.fillna(2.0)


In [29]:
# transpose the matrix, because the population code for a given sample will be a column, so the samples will need to be the indexes
df = df.T

In [31]:
# write function to add correct population code for each sample
def assign_population(x):
    if x in CHB:
        return 'Chinese (CHB)'
    elif x in CEU:
        return 'European (CEU)'
    elif x in YRI:
        return 'African (YRI)'
    elif x in JPT:
        return 'Japanese (JPT)'
    else:
        return 'Unknown'

df['population_group'] = df.index.map(assign_population)

In [35]:
df.iloc[:,-1].head() #success!

NA06985    European (CEU)
NA06986           Unknown
NA06989           Unknown
NA06991    European (CEU)
NA06993    European (CEU)
Name: population_group, dtype: object