In [None]:
import os
import copy
import math

import pandas as pd
import numpy as np

In [None]:
class VCFParcer:
    """Flatten the INFO field of a VCF file into one TSV column per INFO key.

    ``info_columns`` is an insertion-ordered dict used as an ordered set of
    every INFO key this instance has seen so far (its values are always
    ``None``). It determines the order of the INFO-derived columns.
    """

    def __init__(self):
        self.info_columns = dict()

    def info_row_parser(self, row):
        """Split one record's INFO string into a ``{key: value}`` dict.

        Args:
            row: Mapping-like record (e.g. a DataFrame row) with an ``'INFO'``
                entry holding ``;``-separated ``KEY=VALUE`` or bare ``KEY``
                tokens.

        Returns:
            dict with one entry for every key in ``self.info_columns`` (in
            discovery order). Keys absent from this record, and bare keys
            without ``=``, map to ``None``.

        Side effects:
            Registers previously unseen keys in ``self.info_columns``.
        """
        info = row['INFO']
        # A non-string INFO (e.g. NaN produced for a short line) has no entries.
        tokens = info.split(';') if isinstance(info, str) else ()
        parsed = dict()
        for token in tokens:
            # Skip empty tokens (stray or trailing ';') and '.', the VCF
            # placeholder for a missing INFO field; neither is a real key and
            # would otherwise become a bogus output column.
            if token in ('', '.'):
                continue
            # Only the first '=' separates key from value; values may
            # themselves contain '='.
            key, sep, value = token.partition('=')
            parsed[key] = value if sep else None
            self.info_columns.setdefault(key, None)
        # Start from all known keys so every row is ordered consistently.
        # A shallow copy is enough because every stored value is None.
        ordered = dict(self.info_columns)
        ordered.update(parsed)
        return ordered

    def info_parser(self, df):
        """Return ``df`` with ``INFO`` replaced by one column per INFO key.

        Args:
            df: DataFrame containing an ``'INFO'`` column.

        Returns:
            New DataFrame: the original columns followed by the expanded INFO
            columns, with the ``'INFO'`` column dropped.
        """
        expanded = df.apply(self.info_row_parser, result_type='expand', axis=1)
        return pd.concat([df, expanded], axis=1).drop(columns=['INFO'])

    def parse(self, path_in: str, path_out: str, chunksize=None, nrows=None):
        """Convert a VCF file to a tab-separated file with expanded INFO.

        Args:
            path_in: Path of the VCF file to read.
            path_out: Path of the TSV file to write. Any existing file is
                replaced.
            chunksize: If given, read and write the file in chunks of this
                many rows. In this mode the output has NO header row, and a
                chunk only contains the INFO columns known by the time it was
                parsed, so rows written by different chunks can differ in
                width. Use the returned list as the column names when reading
                the file back.
            nrows: Optional limit on the number of rows read.

        Returns:
            list of output column names: the fixed VCF columns (without
            ``INFO``) followed by every INFO key seen by this instance.
        """
        names = ['CHR', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']
        # NOTE(review): with comment='#', pandas stops parsing a line at the
        # first '#' wherever it occurs, so a '#' inside a data field would
        # truncate that record. Confirm the input has none.
        reader = pd.read_table(
            path_in,
            names=names,
            chunksize=chunksize,
            nrows=nrows,
            comment='#',
            sep='\t',
        )
        if chunksize is not None:
            rows_done = 0
            for chunk_no, chunk in enumerate(reader):
                # The first chunk truncates the output so that re-running does
                # not append a second copy of the data to an existing file.
                self.info_parser(chunk).to_csv(
                    path_out, sep='\t', index=False, header=False,
                    mode='w' if chunk_no == 0 else 'a')
                # Count real rows: the last chunk may be shorter than chunksize.
                rows_done += len(chunk)
                print('COUNT: ', rows_done)
        else:
            self.info_parser(reader).to_csv(
                path_out, sep='\t', index=False)
        return names[:-1] + list(self.info_columns)

In [None]:
# Input VCF file, passed to VCFParcer.parse as path_in.
path_in = '/uftp/shared/clinvar.vcf'
# Output TSV: written by the parser, read back into res_df, and finally
# overwritten by the last cell with the extended table.
path_out = '/uftp/shared/clinvar_from_vcf.tsv'

In [None]:
# Convert the VCF at path_in into a TSV at path_out in a single pass
# (no chunksize); the call's return value is the cell output.
VCFParcer().parse(path_in=path_in, path_out=path_out)

In [None]:
# Load the tab-separated file written by the parser back into memory.
res_df = pd.read_csv(path_out, sep='\t')

In [None]:
# Split each MC cell on commas into a list of entries; missing cells stay missing.
mc = res_df['MC'].str.split(pat=',')

In [None]:
def expand(writes):
    """Split ``'<SO>|<consequence>'`` entries into two comma-joined strings.

    Args:
        writes: A list or tuple of ``'<SO>|<consequence>'`` strings, a raw
            comma-separated string of such entries, or a missing value
            (``None`` / NaN).

    Returns:
        dict with keys ``'SO'`` and ``'consequence'``, each holding the
        corresponding parts joined by ``', '``. Both are empty strings when
        there are no entries.

    Raises:
        ValueError: If an entry does not contain exactly one ``'|'``.
    """
    if isinstance(writes, str):
        # A raw string would otherwise be iterated character by character.
        # Empty pieces are dropped so that '' yields no entries.
        entries = [piece for piece in writes.split(',') if piece]
    elif isinstance(writes, (list, tuple)) or pd.notna(writes):
        # Sequences are checked first: pd.notna on a sequence returns an
        # array, whose truth value is ambiguous.
        entries = writes
    else:
        entries = ()

    so_parts = []
    conseq_names = []
    for entry in entries:
        so, consequence_name = entry.split('|')
        so_parts.append(so)
        conseq_names.append(consequence_name)
    return {'SO': ', '.join(so_parts), 'consequence': ', '.join(conseq_names)}

# One {'SO', 'consequence'} dict per row of mc, assembled into a two-column frame.
so_conseq = pd.DataFrame(list(mc.apply(expand)))

In [None]:
# Append the SO / consequence columns to the right of the parsed table.
df_updated = pd.concat(objs=[res_df, so_conseq], axis='columns')

In [None]:
# Overwrites path_out, the same file res_df was loaded from, with the extended table.
df_updated.to_csv(path_or_buf=path_out, index=False, sep='\t')