In [3]:
%%time

import pandas as pd
import numpy
import os
import tarfile

def genotype_compare(geno_a, geno_b):
  (a1, a2) = geno_a.split("|")
  (b1, b2) = geno_b.split("|")

  if a1 == b1 and a2 == b2:
    return 1
  elif a1 == b1 or a2 == b2:
    return .5
  else:
    return 0

def make_matrix(row, samples):
  array = numpy.zeros(shape=[len(samples), len(samples)])
  for j, sample_a in enumerate(samples):
    for k, sample_b in enumerate(samples):
      array[j,k] = genotype_compare(row[sample_a], row[sample_b])
  return array


def export_matrix(vcf_file, skip_rows):
    # todo: handle skiprows?

    vcf_df = pd.read_csv(vcf_file, sep="\t", skiprows=skip_rows, dtype='string')
    samples = list(vcf_df.columns)[9:]
    vcf_df['matrix'] = vcf_df.apply(make_matrix, axis=1)

    vcf_file_base = os.file.basename(vcf_file)

    # create tar file
    tar = tarfile.open(f"{vcf_file_base}.row_matrices.tar.gz")

    # Now save the files
    for row in range(0,vcf_df.shape[0]):
      row_df = pd.DataFrame(vcf_df.iloc[row]['matrix'], columns=samples, index=samples)
      id = vcf_df.iloc[row]['ID']
      filename = f"{vcf_file_base}.{id}.matrix.tsv"
      row_df.to_csv(filename, sep="\t")
      tar.add(filename)
      os.remove(filename)

    tar.close()


def generate_matrices():
    phenotypes = ["max_height_cm", "max_growth_cm_gdd"]
    pvalues = ["p0001", "p0005", "p001"]

    for phenotype in phenotypes:
        for pvalue in pvalues:
            export_matrix(f"../vcf/sorghum.filtered.season4.season6.{phenotype}_{pvalue}.vcf.gz", skip_rows=83)
            export_matrix(f"../vcf/sorghum.filtered.season4.season6.{phenotype}_{pvalue}_qtl.vcf.gz", skip_rows=84)

generate_matrices()



NameError: name 'samples' is not defined