In [None]:
import pandas as pd
import numpy as np
import datatable as dt

# Steps to prepare Hapmap SNP file for LinkImpute imputation

### 1. Convert the hapmap to a numeric format

In [None]:
# Read the tab-separated hapmap file, then discard the columns at
# positions 4-10 so that only the first four columns and the columns
# from position 11 onward remain.
df = pd.read_csv("file.hmp.txt", sep="\t")
unused_columns = df.columns[4:11]
df = df.drop(columns=unused_columns)
df.head()

In [None]:
# Keep only bi-allelic SNPs whose two alleles are single, distinct bases.
# The accepted pairs are generated rather than typed by hand so that all
# 12 ordered combinations are covered (a hand-written list is easy to get
# wrong, e.g. repeating "G/A" and omitting "G/C").
# NOTE: the same filter is applied again when the hapmap file is reloaded
# after imputation; both filters must accept the same pairs so rows line up.
# Ended up not being necessary but worth running
bases = "ATGC"
snp_alleles = [left + "/" + right for left in bases for right in bases if left != right]
df = df[df["alleles"].isin(snp_alleles)]
df.head()

In [None]:
# Convert the genotype calls of a bi-allelic SNP row to numeric codes
def to_numeric(row):
    """Encode the genotype calls of one hapmap row as integers.

    The row must hold the allele pair (e.g. "A/G") at position 1 and the
    genotype calls from position 4 onward. The first character of the
    allele pair maps to 0, the last character to 1, and "N" (missing)
    to -1. Positions 0-3 are returned unchanged.

    Returns:
        A Series with the same labels as ``row``, so the frame produced by
        ``df.apply(to_numeric, axis=1)`` keeps its column names.

    Raises:
        KeyError: if a call is neither one of the two alleles nor "N".
    """
    row_list = row.tolist()
    alleles = row_list[1]
    # Left allele -> 0, right allele -> 1, missing -> -1
    allele_dict = {alleles[0]: 0, alleles[-1]: 1, "N": -1}
    row_list[4:] = [allele_dict[allele] for allele in row_list[4:]]
    # Reuse the incoming labels; a bare pd.Series(row_list) would replace
    # them with 0..n-1 and the column names would be lost after apply().
    return pd.Series(row_list, index=row.index)
# Run the conversion once per row (i.e. once per SNP)
df = df.apply(to_numeric, axis="columns")

### 2. Save numeric hapmap array and run LinkImpute (externally)

In [None]:
# Leave out the four leading annotation columns, then transpose so the
# remaining columns become rows, and write the matrix space-separated
# without index or header.
array_format = df.iloc[:, 4:].T
array_format.to_csv(
    "numerical/sorghum.array",
    sep=" ",
    header=False,
    index=False,
)

### 3. Load the imputation results and the original hapmap file

In [None]:
# Imputed matrix, transposed so its rows line up with the hapmap rows
# (presumably SNPs x samples after the transpose; confirm against the
# LinkImpute output layout).
imp = pd.read_csv("numerical/sorghum_imp.array", sep="\t", header=None).T

# Original hapmap file, restricted to bi-allelic single-base SNPs.
# The accepted pairs are generated so that all 12 ordered combinations are
# covered (the hand-written list repeated "G/A" and omitted "G/C").
# NOTE: this must accept the same pairs as the filter applied before the
# numeric array was exported, otherwise the row counts will not match.
df = pd.read_csv("EP_filtered_165883_numeric.txt.hmp.txt", sep="\t")
bases = "ATGC"
snp_alleles = [left + "/" + right for left in bases for right in bases if left != right]
df = df[df["alleles"].isin(snp_alleles)]
df = df.reset_index(drop=True)

# Overwrite the genotype columns (position 11 onward) with the imputed
# values. The assignment is purely positional, so check the shapes first
# and pass a plain array to rule out any label alignment.
assert df.iloc[:, 11:].shape == imp.shape, (
    f"genotype block {df.iloc[:, 11:].shape} does not match imputed array {imp.shape}"
)
df.iloc[:, 11:] = imp.to_numpy()

# Saved so the header can be restored after the row-wise conversion back
# to alleles, which may return a frame without the column names.
columns = df.columns
df.head()

### 4. Convert the numerical imputed array back to hapmap format

In [None]:
# After imputation, reverse the numeric format to allelic so the hapmap file can be loaded in TASSEL
def to_allelic(row):
    """Decode the numeric genotype codes of one hapmap row back to bases.

    The row must hold the allele pair (e.g. "A/G") at position 1 and the
    numeric codes from position 11 onward. Code 0 maps to the first
    character of the allele pair, 1 to the last character, and -1 (a call
    still missing after imputation) back to "N". Positions 0-10 are
    returned unchanged.

    Returns:
        A Series with the same labels as ``row``, so the frame produced by
        ``df.apply(to_allelic, axis=1)`` keeps its column names.

    Raises:
        KeyError: if a code is not 0, 1 or -1.
    """
    row_list = row.tolist()
    alleles = row_list[1]
    # 0 -> left allele, 1 -> right allele, -1 -> missing
    allele_dict = {0: alleles[0], 1: alleles[-1], -1: "N"}
    row_list[11:] = [allele_dict[code] for code in row_list[11:]]
    # Reuse the incoming labels; a bare pd.Series(row_list) would replace
    # them with 0..n-1 and the column names would be lost after apply().
    return pd.Series(row_list, index=row.index)

# Decode every row, then put the saved hapmap header back on the result
hmp = df.apply(to_allelic, axis="columns").set_axis(columns, axis="columns")

In [None]:
# Write the imputed hapmap table tab-separated, with header and without the row index
hmp.to_csv(
    "numerical/sorghum_imp.hmp.txt",
    sep="\t",
    header=True,
    index=False,
)