In [1]:
%%writefile testutility.py
import logging
import os
import subprocess
import yaml
import pandas as pd
import datetime 
import gc
import re


################
# File Reading #
################

def read_config_file(filepath):
    """Load a YAML configuration file.

    Parameters
    ----------
    filepath : str or path-like
        Location of the YAML file to read.

    Returns
    -------
    object or None
        Whatever ``yaml.safe_load`` produces for the file. If the content
        is not valid YAML the error is logged and ``None`` is returned.
        Errors raised while opening the file are not caught here.
    """
    with open(filepath, 'r') as stream:
        try:
            parsed = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            # Best effort: report the parse problem and hand back None.
            logging.error(exc)
            parsed = None
    return parsed


def replacer(string, char):
    """Collapse every run of two or more consecutive `char` into one `char`.

    Parameters
    ----------
    string : str
        Text to clean up.
    char : str
        Character (or substring) whose repeats are collapsed. It is matched
        literally, so regex metacharacters such as '.' or '*' are safe.

    Returns
    -------
    str
        `string` with each run of repeated `char` replaced by a single
        `char`. An empty `char` leaves `string` unchanged.
    """
    if not char:
        # Nothing to collapse.
        return string
    # Escape so metacharacters are taken literally, and group so the {2,}
    # quantifier applies to all of `char`, not just its last character.
    pattern = '(?:' + re.escape(char) + '){2,}'
    # A callable replacement inserts `char` verbatim (no backslash processing).
    return re.sub(pattern, lambda _match: char, string)

def col_header_val(df,table_config):
    '''
    Normalize the column names of `df` and validate them against the config.

    Normalization (applied in place to ``df.columns``): lower-case, replace
    every non-word character with '_', strip leading/trailing '_', and
    collapse runs of '_' into one.

    The normalized names are then compared, ignoring order, with the
    lower-cased names listed under ``table_config['columns']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose header is checked. Its column labels are rewritten with
        the normalized names; the data itself is not touched or copied.
    table_config : mapping
        Must contain a 'columns' entry listing the expected column names.

    Returns
    -------
    int
        1 if the names match, 0 otherwise (details are printed and logged).
    '''
    # Raw-string patterns: '\w' inside a plain literal is an invalid escape.
    df.columns = (
        df.columns.str.lower()
        .str.replace(r'[^\w]', '_', regex=True)
        .str.strip('_')
        .str.replace(r'_{2,}', '_', regex=True)
    )

    # Compare sorted name lists only. Reindexing the frame would copy all the
    # data and fails outright when normalization yields duplicate labels.
    actual_col = sorted(df.columns)
    # str() guards against YAML parsing an unquoted name as a non-string.
    expected_col = sorted(str(name).lower() for name in table_config['columns'])

    if actual_col == expected_col:
        print("column name and column length validation passed")
        return 1

    print("column name and column length validation failed")
    mismatched_columns_file = list(set(actual_col).difference(expected_col))
    print("Following File columns are not in the YAML file",mismatched_columns_file)
    missing_YAML_file = list(set(expected_col).difference(actual_col))
    print("Following YAML columns are not in the file uploaded",missing_YAML_file)
    logging.info(f'df columns: {df.columns}')
    logging.info(f'expected columns: {expected_col}')
    return 0

Overwriting testutility.py


In [2]:
%%writefile file.yaml
# Extension of the source data file; appended to file_name to build the path that is read.
file_type: csv
# NOTE(review): not read by any code shown in this notebook — confirm where it is used.
dataset_name: testfile
# Base name (without extension) of the source data file.
file_name: data
# NOTE(review): not read by any code shown in this notebook — confirm where it is used.
table_name: edsurv
# Separator handed to pandas.read_csv when the source file is loaded.
inbound_delimiter: ","
# NOTE(review): presumably the separator for written output; unused in the code shown — confirm.
outbound_delimiter: "|"
# NOTE(review): not applied by any code shown in this notebook — confirm it is honoured on read.
skip_leading_rows: 1
# Expected column names. They are lower-cased and compared, ignoring order,
# with the file's normalized header names.
columns: 
    - number
    - id 
    - cell_id
    - source
    - ancestor_id
    - pct_rank

Overwriting file.yaml


In [3]:
import pandas as pd
# Quick look at the raw CSV, read with pandas defaults (no config involved yet).
pd.read_csv("data.csv")

  pd.read_csv("data.csv")


Unnamed: 0.1,Unnamed: 0,id,cell_id,source,ancestor_id,pct_rank
0,1,0944b58318b789,fced0b7a,# Spectrogram-based CNN for the Tensorflow Spe...,0212b702,0.000000
1,3,59958672e3bf59,9b68df7e,# A simple explanation and implementation of D...,8f0f9cda,0.000000
2,4,b22e24942614c9,fa089df9,Name: Aviral Jain | \nRoll No: 18AG3AI08 | \nB...,52b2390a,0.000000
3,5,59959edc72f7c1,815f8d1f,"<img src=""https://drive.google.com/uc?export=d...",2ba4272a,0.000000
4,7,599548cea78ff7,7b084d0a,# DATA UNDESRTANDING\n\n#### Churn veri seti i...,ce055a99,0.000000
...,...,...,...,...,...,...
2166063,6370638,d9cd4b84d376a6,a27c4c8c,"<span class=text markdown=""1"">\n\n1. <a id='ci...",657eac68,0.998703
2166064,6370640,0a0ed7e3e80ff4,8f223df6,Hence the final savings made with our model is...,4a20c2b4,0.998726
2166065,6370642,c8e304fd3f7790,1d8fc9ec,## Final models were chosen by Kaggle automati...,b703eac8,0.998853
2166066,6370643,3e406a0098a034,dedec350,# End,3571ebc0,0.998879


In [4]:
import testutility as util
# Parse file.yaml into Python objects; read_config_file returns None if the YAML is invalid.
config_data = util.read_config_file("file.yaml")

In [5]:
# Delimiter configured for parsing the inbound file.
config_data['inbound_delimiter']

','

In [6]:
# Display the full parsed configuration.
config_data

{'file_type': 'csv',
 'dataset_name': 'testfile',
 'file_name': 'data',
 'table_name': 'edsurv',
 'inbound_delimiter': ',',
 'outbound_delimiter': '|',
 'skip_leading_rows': 1,
 'columns': ['number', 'id', 'cell_id', 'source', 'ancestor_id', 'pct_rank']}

In [7]:
# Read the data file described by the config: ./<file_name>.<file_type>
file_type = config_data['file_type']
source_file = f"./{config_data['file_name']}.{file_type}"
# Pass the delimiter by keyword: every read_csv argument after the path is
# keyword-only in current pandas, and positional use was deprecated before that.
df = pd.read_csv(source_file, sep=config_data['inbound_delimiter'])
df.head()

  df = pd.read_csv(source_file,config_data['inbound_delimiter'])
  df = pd.read_csv(source_file,config_data['inbound_delimiter'])


Unnamed: 0.1,Unnamed: 0,id,cell_id,source,ancestor_id,pct_rank
0,1,0944b58318b789,fced0b7a,# Spectrogram-based CNN for the Tensorflow Spe...,0212b702,0.0
1,3,59958672e3bf59,9b68df7e,# A simple explanation and implementation of D...,8f0f9cda,0.0
2,4,b22e24942614c9,fa089df9,Name: Aviral Jain | \nRoll No: 18AG3AI08 | \nB...,52b2390a,0.0
3,5,59959edc72f7c1,815f8d1f,"<img src=""https://drive.google.com/uc?export=d...",2ba4272a,0.0
4,7,599548cea78ff7,7b084d0a,# DATA UNDESRTANDING\n\n#### Churn veri seti i...,ce055a99,0.0


In [12]:
# NOTE(review): execution counts jump from In [7] to In [12]. The names shown below are already
# lower-cased/underscored and one of the seven columns loaded above is gone, so df was modified
# by cells that are no longer in this notebook — confirm it still works under Restart & Run All.
df.columns

Index(['unnamed_0', 'id', 'cell_id', 'source', 'ancestor_id', 'pct_rank'], dtype='object')

In [13]:
# Rename 'unnamed_0' to 'number', the name file.yaml lists under `columns`.
df = df.rename(columns = {'unnamed_0': 'number'})

In [14]:
# Validate the file's header against the YAML column list.
# col_header_val returns 1 when the names match and 0 otherwise; it also
# rewrites df.columns in place with the normalized names.
util.col_header_val(df,config_data)

column name and column length validation passed


1

In [15]:
# Column names after validation.
df.columns

Index(['number', 'id', 'cell_id', 'source', 'ancestor_id', 'pct_rank'], dtype='object')

In [16]:
# Print the file's column names and the YAML's expected names for a visual comparison.
print("columns of files are:" ,df.columns)
print("columns of YAML are:" ,config_data['columns'])

columns of files are: Index(['number', 'id', 'cell_id', 'source', 'ancestor_id', 'pct_rank'], dtype='object')
columns of YAML are: ['number', 'id', 'cell_id', 'source', 'ancestor_id', 'pct_rank']
