In [1]:
import os
import pickle

import numpy as np
import pandas as pd

In [None]:
# download sample information if needed
!wget -P data/ http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/integrated_call_samples_v3.20130502.ALL.panel

In [2]:
# read the array of sample IDs from disk and report how many there are
samples = np.load('data/samples.npy')
print(f'Number of samples: {len(samples)}')

Number of samples: 2504


In [5]:
# Map each sample to its demographic labels.
# key = sample_id, value = [population, super_population, gender]
sample_info = {}

# read the tab-separated panel file with demographic info for each sample
with open('data/integrated_call_samples_v3.20130502.ALL.panel', 'r') as file:

    # skip header; the default keeps an empty file from raising StopIteration
    next(file, None)

    for line in file:
        stripped = line.strip()
        if not stripped:
            # a blank line (e.g. trailing newline at end of file) carries no
            # sample and would otherwise fail with IndexError below
            continue

        # extract info and add to dictionary
        columns = stripped.split('\t')
        sample_info[columns[0]] = [columns[1], columns[2], columns[3]]

# save the sample_info dictionary to a file using pickle
with open('data/sample_info.pkl', 'wb') as file:
    pickle.dump(sample_info, file)

In [9]:
def _count_values(values, column_name):
    """Tabulate how often each distinct label occurs.

    Parameters
    ----------
    values : list
        One label per sample. Missing labels (``None``) are not counted,
        because ``DataFrame.value_counts`` drops rows with missing values
        by default.
    column_name : str
        Header used for the label column of the result.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``column_name`` and ``'Count'``, sorted by ``column_name``.
    """
    counts = pd.DataFrame({column_name: values}).value_counts().reset_index()
    counts.columns = [column_name, 'Count']
    return counts.sort_values(by=column_name)


# look up every sample once; samples absent from sample_info get None for each field
_missing_record = [None, None, None]
_sample_records = [sample_info.get(sample_id, _missing_record) for sample_id in samples]

# per-sample labels, in the same order as `samples`
populations = [record[0] for record in _sample_records]
superpopulations = [record[1] for record in _sample_records]
genders = [record[2] for record in _sample_records]

# one count table per demographic field
population_counts = _count_values(populations, 'Population')
superpopulation_counts = _count_values(superpopulations, 'Superpopulation')
gender_counts = _count_values(genders, 'Gender')

# export counts to csv (create the output directory first so a fresh checkout works)
os.makedirs('data/csvs', exist_ok=True)
population_counts.to_csv('data/csvs/population_counts.csv', index=False)
superpopulation_counts.to_csv('data/csvs/superpopulation_counts.csv', index=False)
gender_counts.to_csv('data/csvs/gender_counts.csv', index=False)

In [4]:
# one named color per superpopulation code
superpopulation_colors = dict(
    AFR='red',
    AMR='blue',
    EAS='green',
    EUR='orange',
    SAS='purple',
)

# one hex color per subpopulation code, listed alphabetically by code
subpopulation_colors = dict([
    ('ACB', '#ff7f0e'),
    ('ASW', '#2ca02c'),
    ('BEB', '#d62728'),
    ('CDX', '#9467bd'),
    ('CEU', '#8c564b'),
    ('CHB', '#e377c2'),
    ('CHS', '#7f7f7f'),
    ('CLM', '#bcbd22'),
    ('ESN', '#6b6ecf'),
    ('FIN', '#9edae5'),
    ('GBR', '#dbdb8d'),
    ('GIH', '#c7c7c7'),
    ('GWD', '#f7b6d2'),
    ('IBS', '#c49c94'),
    ('ITU', '#f1b0b0'),
    ('JPT', '#e7ba52'),
    ('KHV', '#7b4173'),
    ('LWK', '#a55194'),
    ('MSL', '#843c39'),
    ('MXL', '#5254a3'),
    ('PEL', '#17becf'),
    ('PJL', '#9c9ede'),
    ('PUR', '#637939'),
    ('STU', '#b5cf6b'),
    ('TSI', '#1f77b4'),
    ('YRI', '#d6616b'),
])

# persist both palettes with pickle, one file per dictionary
for pickle_path, palette in (
    ('data/superpopulation_colors.pkl', superpopulation_colors),
    ('data/subpopulation_colors.pkl', subpopulation_colors),
):
    with open(pickle_path, 'wb') as handle:
        pickle.dump(palette, handle)

| Population                                | Superpopulation                         | Color    |
|-------------------------------------------|-----------------------------------------|----------|
| ACB (African Caribbean in Barbados)        | AFR (African)                           | #ff7f0e  |
| ASW (African Ancestry in Southwest USA)    | AFR (African)                           | #2ca02c  |
| BEB (Bengali in Bangladesh)                | SAS (South Asian)                       | #d62728  |
| CDX (Chinese Dai in Xishuangbanna, China)  | EAS (East Asian)                        | #9467bd  |
| CEU (Utah Residents with Northern and Western European Ancestry) | EUR (European)         | #8c564b  |
| CHB (Han Chinese in Beijing, China)        | EAS (East Asian)                         | #e377c2  |
| CHS (Southern Han Chinese)                 | EAS (East Asian)                         | #7f7f7f  |
| CLM (Colombian in Medellin, Colombia)      | AMR (Admixed American)                   | #bcbd22  |
| ESN (Esan in Nigeria)                      | AFR (African)                            | #6b6ecf  |
| FIN (Finnish in Finland)                   | EUR (European)                           | #9edae5  |
| GBR (British in England and Scotland)      | EUR (European)                           | #dbdb8d  |
| GIH (Gujarati Indian in Houston, USA)      | SAS (South Asian)                        | #c7c7c7  |
| GWD (Gambian in Western Divisions in the Gambia) | AFR (African)                         | #f7b6d2  |
| IBS (Iberian Population in Spain)          | EUR (European)                           | #c49c94  |
| ITU (Indian Telugu in the UK)              | SAS (South Asian)                        | #f1b0b0  |
| JPT (Japanese in Tokyo, Japan)             | EAS (East Asian)                         | #e7ba52  |
| KHV (Kinh in Ho Chi Minh City, Vietnam)    | EAS (East Asian)                         | #7b4173  |
| LWK (Luhya in Webuye, Kenya)               | AFR (African)                            | #a55194  |
| MSL (Mende in Sierra Leone)                | AFR (African)                            | #843c39  |
| MXL (Mexican Ancestry from Los Angeles, USA) | AMR (Admixed American)                 | #5254a3  |
| PEL (Peruvian in Lima, Peru)               | AMR (Admixed American)                   | #17becf  |
| PJL (Punjabi in Lahore, Pakistan)          | SAS (South Asian)                        | #9c9ede  |
| PUR (Puerto Rican in Puerto Rico)          | AMR (Admixed American)                   | #637939  |
| STU (Sri Lankan Tamil in the UK)           | SAS (South Asian)                        | #b5cf6b  |
| TSI (Toscani in Italy)                     | EUR (European)                           | #1f77b4  |
| YRI (Yoruba in Ibadan, Nigeria)            | AFR (African)                            | #d6616b  |
