In [13]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


In [14]:
import os
import pandas as pd
import numpy as np
import yaml
from IPython.core.display import display, HTML

In [15]:
# Function to load yaml configuration file
def load_config(config_name):
    """Parse one YAML file from the configuration directory.

    The file is looked up under the module-level ``config_path`` and parsed
    with ``yaml.safe_load``; the parsed content is returned.
    """
    config_file = os.path.join(config_path, config_name)
    with open(config_file, 'r') as stream:
        return yaml.safe_load(stream)

# Directory holding the YAML configuration files. The path is relative, so it is
# resolved against the current working directory at the time load_config runs.
config_path = "conf/base"
# Load the YAML catalog configuration file.
config = load_config("catalog.yml")

# Switch into the project directory named in the catalog; root_path then holds
# that directory as reported by os.getcwd().
os.chdir(config["project_path"])
root_path = os.getcwd()

### Functions to initialize data ingestion

In [16]:
def raw_file_checker(files):
    """Check that the raw file list covers every expected dataset keyword.

    Parameters
    ----------
    files : iterable of str
        File names to inspect.

    Returns
    -------
    condition : bool
        True only when each of 'RC', 'Vocab_1' and 'Vocab_2' occurs in at
        least one file name.
    file_exists : dict
        Maps every keyword that was found to the matching file name. When
        several names contain the same keyword, the last one wins.
    """
    keywords = ('RC', 'Vocab_2', 'Vocab_1')

    file_exists = {}
    for fname in files:
        for key in keywords:
            if key in fname:
                file_exists[key] = fname

    # Judge on the distinct keywords found rather than on the number of
    # matches: two RC files plus one Vocab_1 file is three matches, yet
    # Vocab_2 is still missing.
    condition = all(key in file_exists for key in keywords)

    if condition:
        print("PASS: All files exists!")
    else:
        print("FAIL: Not all file exists! Please check the raw data folder to ensure RC, Vocab_1 and Vocab_2 file exists.")

    return condition, file_exists


def data_ingestion_initialize(root_path):
    """Resolve the data paths, load the reference columns and locate the raw files.

    Parameters
    ----------
    root_path : str
        Project root that the catalog's relative data paths are joined onto.

    Returns
    -------
    tuple
        ``(raw_data_path, out_data_path, ref_path, ref_filepath, ref_data,
        ref_data_cols, files, file_exists, rc_filepath, v1_filepath,
        v2_filepath)``.

    Raises
    ------
    KeyError
        When the raw folder has no file for one of 'RC', 'Vocab_1' or
        'Vocab_2'. The exception type is the one the bare dictionary lookup
        used to raise; the message now names the missing datasets.
    """
    # Read the catalog from the module-level config_path.
    with open(os.path.join(config_path, "catalog.yml"), 'r') as file:
        config = yaml.safe_load(file)

    print("Initialize data ingestion and file checking...")

    # define input and output data paths
    raw_data_path = os.path.join(root_path, config["data_path"]["input"])
    out_data_path = os.path.join(root_path, config["data_path"]["output"])

    # define reference file paths and read the reference column names
    # (first column of the header-less "columns_check" sheet)
    ref_path = os.path.join(root_path, config["data_path"]["ref"])
    ref_filepath = os.path.join(ref_path, config["filenames"]["rc_col_ref"])
    ref_data = pd.read_excel(io=ref_filepath, sheet_name="columns_check", header=None)
    ref_data_cols = ref_data[0].tolist()

    # keep only the .xls files of the raw folder
    files = [f for f in os.listdir(raw_data_path) if f.endswith('.xls')]

    condition, file_exists = raw_file_checker(files)

    # Stop with an actionable message instead of a bare KeyError further down.
    missing = [key for key in ('RC', 'Vocab_1', 'Vocab_2') if key not in file_exists]
    if missing:
        raise KeyError(f"No raw .xls file found in {raw_data_path} for: {', '.join(missing)}")

    # Define raw data filepaths
    rc_filepath = os.path.join(raw_data_path, file_exists['RC'])
    v1_filepath = os.path.join(raw_data_path, file_exists['Vocab_1'])
    v2_filepath = os.path.join(raw_data_path, file_exists['Vocab_2'])

    return raw_data_path, out_data_path, ref_path, ref_filepath, ref_data, ref_data_cols, files, file_exists, rc_filepath, v1_filepath, v2_filepath

# Run the ingestion set-up once. The returned paths, reference columns and file
# map become module-level names; several functions further down read them
# directly (e.g. file_exists, ref_data_cols, ref_filepath, out_data_path).
raw_data_path, out_data_path, ref_path, ref_filepath, ref_data, ref_data_cols, files, file_exists, rc_filepath, v1_filepath, v2_filepath = data_ingestion_initialize(root_path)

Initialize data ingestion and file checking...
PASS: All files exists!


### Function to create dataframes

In [17]:
def create_dataframes(file_initial, rc_filepath, v1_filepath , v2_filepath):
    '''
    Read the 'Summary', 'Data' and 'Answer Key' sheets of one raw workbook.

    file_initial choices -
    RC: Reading Comprehension
    Vocab_1: Vocabulary 1
    Vocab_2: Vocabulary 2

    Returns (df_summary, df_summary_cols, df_data, df_data_cols, df_ans_key,
    df_ans_key_cols), where each *_cols value is the list of column names of
    the matching dataframe.

    Raises ValueError when file_initial is not one of the choices above.
    '''

    filepaths = {'RC': rc_filepath, 'Vocab_1': v1_filepath, 'Vocab_2': v2_filepath}
    if file_initial not in filepaths:
        # Without this guard an unknown value left `filepath` unassigned.
        raise ValueError(
            f"Unknown file_initial {file_initial!r}; expected one of {list(filepaths)}"
        )
    filepath = filepaths[file_initial]

    # Passing a list of sheet names returns a dict of dataframes, so the
    # workbook is opened and parsed once instead of three times.
    sheets = pd.read_excel(io=filepath, sheet_name=["Summary", "Data", "Answer Key"])

    # dataframe from 'Summary' sheet
    df_summary = sheets["Summary"]
    df_summary_cols = list(df_summary.columns)

    # dataframe from 'Data' sheet
    df_data = sheets["Data"]
    df_data_cols = list(df_data.columns)

    # dataframe from 'Answer Key' sheet
    df_ans_key = sheets["Answer Key"]
    df_ans_key_cols = list(df_ans_key.columns)

    print(f"Dataframe created from {file_initial} file")

    return df_summary, df_summary_cols, df_data, df_data_cols, df_ans_key, df_ans_key_cols


### Data integrity scanning functions

In [18]:
class color:
    """ANSI escape sequences used to style text printed to the console."""

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Foreground colours
    DARKCYAN = '\033[36m'
    RED = '\033[91m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'
    PURPLE = '\033[95m'
    CYAN = '\033[96m'

    # Resets every attribute set by the codes above
    END = '\033[0m'

def print_scan_results(col_condition_num, scan_num, file_initial , sheets = 'Summary'):
    """Print the header and the coloured PASS/FAIL line for one integrity scan.

    Parameters
    ----------
    col_condition_num : bool
        Outcome of the scan; a truthy value prints PASS, otherwise FAIL.
    scan_num : int
        Scan identifier, 1 to 7. Any other value prints nothing.
    file_initial : str
        Dataset label shown in the header. SCAN-7 is only reported when this
        is 'RC'.
    sheets : str
        Sheet name shown in the header.

    The message wording was corrected: '_workder_id' is now '_worker_id', the
    missing quotes around 'Language' were restored, and "either ... and" now
    reads "both ... and".
    """
    # SCAN-3 and SCAN-6 share the same wording.
    worker_id_report = (
        "Checking if '_worker_id' column name is correct ...",
        ": valid '_worker_id' column name",
        ": invalid '_worker_id' column name",
    )

    # scan number -> (header text, PASS text, FAIL text)
    reports = {
        1: (
            "Checking if the sheet contains both 'Language' and 'Market' columns ...",
            ": 'Summary' sheet contains both 'Language' and 'Market' columns",
            ": 'Summary' sheet does not contain both 'Language' and 'Market' columns",
        ),
        2: (
            "Checking if 'Language' and 'Market' columns are empty ...",
            ": Both 'Language' and 'Market' columns in 'Summary' contain complete data",
            ": Both or either 'Language' and 'Market' columns in 'Summary' sheet are empty or incomplete",
        ),
        3: worker_id_report,
        4: (
            "Checking if sheet contains 'Language' column ...",
            ": 'Data' sheet contains 'Language' columns",
            ": 'Data' sheet does not contain 'Language' columns",
        ),
        5: (
            "Checking if 'Language' column is empty ...",
            ": 'Language' column in 'Data' contains complete data",
            ": 'Language' column in 'Data' sheet is empty or incomplete",
        ),
        6: worker_id_report,
        7: (
            "checking if columns in the 'Data' sheet are identical to the reference columns ...",
            ": The columns in the 'Data' sheet are identical to the reference",
            ": The columns in the 'Data' sheet are not identical to the reference",
        ),
    }

    if scan_num not in reports:
        return
    # The reference-column scan only exists for the RC dataset.
    if scan_num == 7 and file_initial != 'RC':
        return

    header, passed_msg, failed_msg = reports[scan_num]
    print(f"\nSCAN-{scan_num} : {file_initial} - {sheets} : {header}")
    if col_condition_num:
        print(color.GREEN + "PASS" + color.END + passed_msg)
    else:
        print(color.RED + "FAIL" + color.END + failed_msg)
            
def summary_col_check(df_summary, df_summary_cols, file_initial , sheets = 'Summary'):
    """SCAN-1: report whether both 'Language' and 'Market' are in df_summary_cols.

    Only ``df_summary_cols`` is inspected; the other arguments are accepted
    but not used. Returns ``(passed, 1)`` where ``passed`` is True only when
    both names are present.
    """
    required = ('Language', 'Market')
    present = {name: (name in df_summary_cols) for name in required}
    return all(present.values()), 1

def summary_col_value_check(df_summary, file_initial, sheets = 'Summary'):
    """SCAN-2: report whether 'Language' and 'Market' hold no null values.

    Both columns are looked up in ``df_summary`` (a missing column raises
    KeyError). Returns ``(passed, 2)`` where ``passed`` is True only when
    neither column contains a null.
    """
    # A list (not a generator) so that both columns are always evaluated.
    complete = [
        bool(df_summary[name].notnull().values.all())
        for name in ('Language', 'Market')
    ]
    return all(complete), 2

def col_header_check(df_summary_data, file_initial, sheets):
    """SCAN-3 / SCAN-6: check the worker id column name of a sheet.

    The first column whose name contains 'worker' is inspected. The check
    passes when that name is 10 characters long and starts with '_' (the
    shape of '_worker_id').

    Returns ``(passed, 3)``. ``passed`` is False when the name has any other
    shape or when no column name contains 'worker'; previously those cases
    raised UnboundLocalError and IndexError respectively.
    """
    scan_num = 3

    worker_cols = [
        col for col in df_summary_data.columns
        if isinstance(col, str) and 'worker' in col
    ]
    if not worker_cols:
        return False, scan_num

    worker_col = worker_cols[0]
    col_condition_3 = len(worker_col) == 10 and worker_col.startswith("_")
    return col_condition_3, scan_num

def data_col_check(df_data, df_data_cols, file_initial, sheets = 'Data'):
    """SCAN-4: report whether 'Language' is among df_data_cols.

    Only ``df_data_cols`` is inspected; the other arguments are accepted but
    not used. Returns ``(passed, 4)``.
    """
    has_language = 'Language' in df_data_cols
    return bool(has_language), 4

def data_col_value_check(df_data, file_initial, sheets = 'Data'):
    """SCAN-5: report whether the 'Language' column holds no null values.

    The column is looked up in ``df_data`` (a missing column raises
    KeyError). Returns ``(passed, 5)``.
    """
    language_complete = df_data['Language'].notnull().values.all()
    return bool(language_complete), 5

def data_col_header_check(df_data_cols, ref_data_cols, file_initial, sheets = 'Data'):
    """SCAN-7: report whether two column lists hold the same names.

    The comparison ignores order: both inputs are compared after sorting.
    ``sorted`` builds new lists, so neither argument is modified (the earlier
    in-place ``.sort()`` reordered the caller's lists).

    Returns ``(passed, 7)``.
    """
    scan_num = 7
    col_condition_7 = sorted(ref_data_cols) == sorted(df_data_cols)
    return col_condition_7, scan_num

def data_integrity_check(df_summary, df_summary_cols, df_data, df_data_cols, file_initial):
    """Run every integrity scan for one dataset and print a report.

    Scans 1-3 inspect the 'Summary' sheet and scans 4-6 the 'Data' sheet.
    SCAN-2 and SCAN-5 (null checks) only run when the column check before
    them passed. SCAN-7 compares the 'Data' columns with the module-level
    ``ref_data_cols`` and only runs when ``file_initial`` is 'RC'.

    Returns
    -------
    integrity_result : bool
        True only when every scan that ran passed.
    conditions_list : list
        The individual scan outcomes in execution order.
    """
    print(color.BOLD + f"Reading {file_initial} raw data and perform data integrity scanning...:\n" + color.END)

    conditions_list = []

    # SCAN-1
    col_condition_1, scan_num = summary_col_check(df_summary, df_summary_cols, file_initial , 'Summary')
    print_scan_results(col_condition_1, scan_num, file_initial , sheets = 'Summary')
    conditions_list.append(col_condition_1)

    # SCAN-2 - needs the columns that SCAN-1 looked for
    if col_condition_1:
        col_condition_2, scan_num = summary_col_value_check(df_summary, file_initial, 'Summary')
        print_scan_results(col_condition_2, scan_num, file_initial , sheets = 'Summary')
        conditions_list.append(col_condition_2)

    # SCAN-3
    col_condition_3, scan_num = col_header_check(df_summary, file_initial, 'Summary')
    print_scan_results(col_condition_3, scan_num, file_initial , sheets = 'Summary')
    conditions_list.append(col_condition_3)

    # SCAN-4
    col_condition_4, scan_num = data_col_check(df_data, df_data_cols, file_initial, sheets = 'Data')
    print_scan_results(col_condition_4, scan_num, file_initial , sheets = 'Data')
    conditions_list.append(col_condition_4)

    # SCAN-5 - needs the column that SCAN-4 looked for
    if col_condition_4:
        col_condition_5, scan_num = data_col_value_check(df_data, file_initial, sheets = 'Data')
        print_scan_results(col_condition_5, scan_num, file_initial , sheets = 'Data')
        conditions_list.append(col_condition_5)

    # SCAN-6 - same check as SCAN-3, reported under its own number
    col_condition_6, _ = col_header_check(df_data, file_initial, 'Data')
    print_scan_results(col_condition_6, 6, file_initial , 'Data')
    conditions_list.append(col_condition_6)

    # SCAN-7 - copies are passed so the shared column lists keep their order
    # whatever the checker does with its arguments
    if file_initial == 'RC':
        col_condition_7, scan_num = data_col_header_check(list(df_data_cols), list(ref_data_cols), file_initial, sheets = 'Data')
        print_scan_results(col_condition_7, scan_num, file_initial , 'Data')
        conditions_list.append(col_condition_7)

    # Final data integrity result: PASS only when all scans passed.
    # conditions_list always holds at least scans 1, 3, 4 and 6, so no
    # special case for a short list is needed.
    integrity_result = all(conditions_list)
    verdict = color.GREEN + ' PASS' if integrity_result else color.RED + ' FAIL'
    print(color.BOLD + f'\n{file_initial} data integrity result:' + verdict + color.END + '\n')

    return integrity_result, conditions_list

### Data cleaning

#### Initializing data cleaning step function

In [23]:
def _ask_yes_no(prompt):
    """Repeat ``prompt`` until the answer is y/yes or n/no; return True for yes."""
    while True:
        answer = input(prompt).strip().lower()
        if answer in ("y", "yes"):
            return True
        if answer in ("n", "no"):
            return False
        print("\nPlease enter either 'y' or 'n' only!")


def data_cleaning_initialize(file_exists):
    """Ask the user for the Language and Market code to use during cleaning.

    The user either types both values or accepts a language derived from the
    RC file name (the text before '_RC') together with the placeholder market
    'XX-XX'.

    Parameters
    ----------
    file_exists : dict
        Must contain the RC file name under the key 'RC'; it is only read
        when the user asks for a suggestion.

    Returns
    -------
    (language, market) : tuple of str
        Both are empty strings when the user declines at any step.

    Every yes/no question is repeated until it gets a valid answer. An
    invalid answer at one of the follow-up questions used to leave
    ``language`` or ``market`` unassigned and crash at the return statement.
    """
    if _ask_yes_no("Do you know the 'Language' and/or 'Market code' for this file? (y/n) : "):
        language = input("\nPlease enter the Language: ").capitalize()
        market = input("\nPlease enter the Market code: eg. EN-EN for English : ").upper()
        return language, market

    if _ask_yes_no("\nWould you like a suggestion for Language (extracted from filename)? (y/n) : "):
        language = file_exists['RC'].split("_RC")[0]
        if _ask_yes_no(f"\nThe suggested language is : {language} . Do you accept this suggestion? (y/n) : "):
            if _ask_yes_no(
                "\nWould you like a default prefill for Market (XX-XX). This will only serve as a temporary value, "
                "please change this as soon as the actual value is known ? (y/n) : "
            ):
                return language, "XX-XX"

    # Reached whenever the user declined one of the offers above.
    print("\nPlease find out the Language and Market code before proceeding - automated data cleaning will NOT be performed. \n")
    return '', ''
        

#### Deploy data cleaning step function

In [24]:
def data_cleaning_deploy(file_initials, language, market):
    """Run the automated cleaning for each dataset once the inputs are validated.

    Parameters
    ----------
    file_initials : iterable of str
        Dataset labels handed one by one to ``clean_data_all``.
    language, market : str
        Values to apply; cleaning is refused when either is an empty string.

    The language is compared with the one embedded in the RC file name (the
    text before '_RC' in the module-level ``file_exists['RC']``). On a
    mismatch the user may fall back to the file-name language together with
    the temporary market 'XX-XX'. Any answer other than y/yes declines the
    fallback and is reported; an unrecognised answer used to end silently.
    """
    def _run_cleaning(lang, mkt):
        # Shared by the direct path and the fallback path.
        print('Starting automated data cleaning....')
        for file_initial in file_initials:
            clean_data_all(file_initial, lang, mkt)
        print("Automated data cleaning completed. Cleaned excel files are located in data>processed folder. \n")

    # check that language and market is not empty
    if language == '' or market == '':
        print('The values for Language and Market must be known before initializing automated data cleaning!')
        return

    # check that language input is consistent with the filename
    file_language = file_exists['RC'].split("_RC")[0]
    if language == file_language:
        _run_cleaning(language, market)
        return

    print('\nWARNING: Language input is inconsistent with the filename!\n')
    print(f'Input Language: {language}')
    print(f'File Language: {file_language}')
    lang_check = input("\nWould you like to default the language name as per the filename? (y/n) : ").strip().lower()

    if lang_check in ("y", "yes"):
        language, market = file_language, 'XX-XX'
        print(f'\nLanguage has been set to: {file_language}')
        print(f'Temporary market code has been set to: {market}\n')
        _run_cleaning(language, market)
    else:
        print('Automated data processing will not run due to language inconsistency. Please try again.')
    

#### Functions for data cleaning

In [25]:
def display_df_side_by_side(dfs:list, captions:list):
    """Display tables side by side to save vertical space
    Input:
        dfs: list of pandas.DataFrame
        captions: list of table captions

    Tables and captions are paired by position; pairing stops at the shorter
    list. The pairs are iterated directly rather than through a dict keyed by
    caption, so two tables sharing a caption are both shown.
    """
    rendered = []
    for caption, df in zip(captions, dfs):
        styled = df.style.set_table_attributes("style='display:inline'").set_caption(caption)
        # three non-breaking spaces separate neighbouring tables
        rendered.append(styled._repr_html_() + "\xa0\xa0\xa0")
    display(HTML("".join(rendered)))
    
    
def clean_summary_sheet(df_summary, df_summary_cols, file_initial, language, market):
    """Return a cleaned copy of the 'Summary' sheet.

    Two repairs are applied when the matching scan fails:

    * SCAN-1 - 'Language' and/or 'Market' are inserted at positions 0 and 1,
      filled with ``language`` and ``market``. Only the column that is really
      absent is inserted, so a sheet missing just one of them no longer makes
      ``insert`` fail on the one that already exists.
    * SCAN-3 - the first column whose name contains 'worker' is renamed to
      '_worker_id'.

    ``df_summary`` itself is left untouched; the work is done on a copy. The
    earlier "rescan" calls were dropped: their results were discarded and
    they were given the pre-cleaning column list.
    """
    df_summary_cleaned = df_summary.copy()

    # SCAN-1 - only repair if condition is False / FAIL
    col_condition_1, _ = summary_col_check(df_summary, df_summary_cols, file_initial , 'Summary')
    if not col_condition_1:
        print("Language and Market columns and values inserted to 'Summary' sheet")
        if 'Language' not in df_summary_cleaned.columns:
            df_summary_cleaned.insert(0, 'Language', language)
        if 'Market' not in df_summary_cleaned.columns:
            df_summary_cleaned.insert(1, 'Market', market)

    # SCAN-3 - only repair if condition is False / FAIL
    col_condition_3, _ = col_header_check(df_summary_cleaned, file_initial, 'Summary')
    if not col_condition_3:
        worker_cols = [
            col for col in df_summary_cleaned.columns
            if isinstance(col, str) and 'worker' in col
        ]
        # Nothing to rename when the sheet has no worker column at all.
        if worker_cols:
            print("Column name worker_id replaced with _worker_id")
            df_summary_cleaned = df_summary_cleaned.rename(columns={worker_cols[0]: "_worker_id"})

    return df_summary_cleaned

def clean_RC_data_sheet_columns(df_data_cleaned, ref_data):
    """Insert the reference columns that are missing from the RC 'Data' sheet.

    The reference column names are read from the first column of the
    'columns_check' sheet of the module-level ``ref_filepath``. The
    ``ref_data`` argument is accepted but not used: a freshly read list is
    guaranteed to be in sheet order, which the insert positions rely on.

    Each missing column is inserted at its reference position, clamped to the
    current number of columns, and filled with NaN. ``df_data_cleaned`` is
    modified in place and also returned. When nothing is missing the frame is
    returned unchanged (this case used to raise while unpacking an empty
    ``zip``).
    """
    reference = pd.read_excel(io = ref_filepath, sheet_name="columns_check", header=None)
    ref_data_cols = reference[0].tolist()

    present = set(df_data_cleaned.columns)
    for idx, col_name in enumerate(ref_data_cols):
        if col_name in present:
            continue
        # DataFrame.insert requires loc <= number of columns.
        position = min(idx, len(df_data_cleaned.columns))
        df_data_cleaned.insert(position, col_name, np.nan)
        present.add(col_name)

    return df_data_cleaned

def clean_V_data_sheet_columns(df_data_cleaned):
    """Collapse the answer columns of a Vocab 'Data' sheet.

    The columns at positions 2, 3 and 4 are inspected: a column without any
    null value is renamed to 'rater_answer', every other one is dropped.

    A new dataframe is returned and ``df_data_cleaned`` is left untouched.
    The earlier in-place ``drop`` modified the caller's frame for every
    column handled before the first rename.
    """
    df_data_col_rem = df_data_cleaned.copy()

    for col in df_data_col_rem.columns.tolist()[2:5]:
        if df_data_col_rem[col].notnull().values.all():
            df_data_col_rem = df_data_col_rem.rename(columns={col: "rater_answer"})
        else:
            df_data_col_rem = df_data_col_rem.drop(columns=col)

    return df_data_col_rem
     
def clean_data_sheet(df_data, df_data_cols, file_initial, language, market):
    """Return a cleaned copy of the 'Data' sheet.

    Repairs, each applied only when the matching scan fails:

    * SCAN-4 - a 'Language' column filled with ``language`` is inserted at
      position 0.
    * SCAN-6 - the first column whose name contains 'worker' is renamed to
      '_worker_id'.
    * SCAN-7 (RC only) - reference columns missing from the sheet are added
      by ``clean_RC_data_sheet_columns``. The scan is run on the columns as
      they are after the two repairs above, not on the stale
      ``df_data_cols`` captured before cleaning.

    For 'Vocab_1' and 'Vocab_2' the answer columns are then collapsed by
    ``clean_V_data_sheet_columns``.

    ``market`` is accepted but not used. ``df_data`` itself is left
    untouched; the work is done on a copy. The earlier "rescan" calls were
    dropped: their results were discarded.
    """
    df_data_cleaned = df_data.copy()

    # SCAN-4 - only repair if condition is False / FAIL
    col_condition_4, _ = data_col_check(df_data, df_data_cols, file_initial, sheets = 'Data')
    if not col_condition_4 and 'Language' not in df_data_cleaned.columns:
        print("Language column and values inserted to 'Data' sheet")
        df_data_cleaned.insert(0, 'Language', language)

    # SCAN-6 - only repair if condition is False / FAIL
    col_condition_6, _ = col_header_check(df_data_cleaned, file_initial, 'Data')
    if not col_condition_6:
        worker_cols = [
            col for col in df_data_cleaned.columns
            if isinstance(col, str) and 'worker' in col
        ]
        # Nothing to rename when the sheet has no worker column at all.
        if worker_cols:
            print("Column name worker_id replaced with _worker_id")
            df_data_cleaned = df_data_cleaned.rename(columns={worker_cols[0]: "_worker_id"})

    if file_initial == 'RC':

        # SCAN-7 - only repair if condition is False / FAIL.
        # Fresh lists are passed so the shared reference list keeps its order
        # whatever the checker does with its arguments.
        current_cols = list(df_data_cleaned.columns)
        col_condition_7, _ = data_col_header_check(current_cols, list(ref_data_cols), file_initial, sheets = 'Data')

        if not col_condition_7:
            print("Missing columns inserted into 'Data' sheet.")
            df_data_cleaned = clean_RC_data_sheet_columns(df_data_cleaned, ref_data_cols)

    if file_initial == 'Vocab_1' or file_initial == 'Vocab_2':

        print(f"Removing unwanted columns from {file_initial} Data sheet")
        df_data_cleaned = clean_V_data_sheet_columns(df_data_cleaned)

    return df_data_cleaned

def write_to_excel(file_initial, out_data_path, file_exists, df_summary_cleaned, df_data_cleaned, df_ans_key, encode=None):
    """Write the three cleaned sheets of one dataset to an .xlsx workbook.

    The workbook is created in ``out_data_path`` and named after the raw file
    (``file_exists[file_initial]``) with its extension replaced by '.xlsx'.
    It holds the sheets 'Summary', 'Data' and 'Answer Key', written without
    the index.

    Parameters
    ----------
    file_initial : str
        'RC', 'Vocab_1' or 'Vocab_2'; any other value writes nothing.
    encode : optional
        Kept for backward compatibility and ignored. ``DataFrame.to_excel``
        no longer accepts an ``encoding`` argument in pandas 2.0 and later,
        so forwarding it would fail there.
    """
    if file_initial not in ('RC', 'Vocab_1', 'Vocab_2'):
        return

    # splitext removes only the final extension, so dots inside the file name
    # are preserved (split('.')[0] cut the name at the first dot).
    stem = os.path.splitext(file_exists[file_initial])[0]
    out_file = os.path.join(out_data_path, stem + '.xlsx')

    sheets = (
        ('Summary', df_summary_cleaned),
        ('Data', df_data_cleaned),
        ('Answer Key', df_ans_key),
    )
    with pd.ExcelWriter(out_file) as writer:
        for sheet_name, frame in sheets:
            frame.to_excel(writer, sheet_name=sheet_name, index=False)

def clean_data(file_initial, language, market):
    """Load one dataset, clean its 'Summary' and 'Data' sheets and report on them.

    The raw workbook is chosen through the module-level rc_filepath,
    v1_filepath and v2_filepath. After cleaning, the first rows and first
    seven columns of both sheets are displayed side by side and the integrity
    scans are run on the cleaned frames.

    Returns (df_summary_cleaned, df_data_cleaned, df_ans_key); the answer key
    is passed through as read.
    """
    (df_summary, df_summary_cols,
     df_data, df_data_cols,
     df_ans_key, _ans_key_cols) = create_dataframes(file_initial, rc_filepath, v1_filepath , v2_filepath)

    # Clean both sheets
    summary_clean = clean_summary_sheet(df_summary, df_summary_cols, file_initial, language, market)
    data_clean = clean_data_sheet(df_data, df_data_cols, file_initial, language, market)

    print('\nPreview cleaned datasets:\n')
    previews = [summary_clean.iloc[:,:7].head(), data_clean.iloc[:,:7].head()]
    display_df_side_by_side(previews, ['df_summary_cleaned', 'df_data_cleaned'])

    # Integrity report on the cleaned frames, using their current column names
    print('\nData integrity report post clean-up:\n')
    data_integrity_check(
        summary_clean, list(summary_clean.columns),
        data_clean, list(data_clean.columns),
        file_initial,
    )

    return summary_clean, data_clean, df_ans_key

#file_initials = ['RC', 'Vocab_1', 'Vocab_2']

def clean_data_all(file_initial, language, market):
    """Clean one dataset and write the result to the output folder.

    The destination folder and the raw file names come from the module-level
    out_data_path and file_exists.
    """
    cleaned_sheets = clean_data(file_initial, language, market)
    write_to_excel(file_initial, out_data_path, file_exists, *cleaned_sheets, encode=None)

### Data Cleaning Run All

In [26]:
def main():
    """Ask for Language and Market, then clean the RC, Vocab_1 and Vocab_2 datasets."""
    language, market = data_cleaning_initialize(file_exists)
    data_cleaning_deploy(['RC', 'Vocab_1', 'Vocab_2'], language, market)


# Start the interactive cleaning workflow when this code runs as the main module.
if __name__ == "__main__":

    main()

Do you know the 'Language' and/or 'Market code' for this file? (y/n) :  y

Please enter the Language:  Indonesian

Please enter the Market code: eg. EN-EN for English :  Indonesian


Starting automated data cleaning....
Dataframe created from RC file
Language and Market columns and values inserted to 'Summary' sheet
Language column and values inserted to 'Data' sheet
Missing columns inserted into 'Data' sheet.

Preview cleaned datasets:



Unnamed: 0,Language,Market,_worker_id,Score,Percentage,Grouping
0,Indonesian,INDONESIAN,45360260,9,0.75,Pilot 2
1,Indonesian,INDONESIAN,45361251,9,0.75,Pilot 2
2,Indonesian,INDONESIAN,45496367,7,0.583333,Pilot 2
3,Indonesian,INDONESIAN,45496590,7,0.583333,Pilot 2
4,Indonesian,INDONESIAN,45630496,5,0.416667,Pilot 2

Unnamed: 0,Language,_id,question_no_1,question_no_2,question_no_3,question_no_4,question_no_5
0,Indonesian,5868431995,a,c,b,,
1,Indonesian,5868432549,a,c,a,,
2,Indonesian,5868445272,a,c,b,,
3,Indonesian,5868445718,a,c,a,,
4,Indonesian,5868446482,a,b,a,,



Data integrity report post clean-up:

[1mReading RC raw data and perform data integrity scanning...:
[0m

SCAN-1 : RC - Summary : Checking if the sheet contains either 'Language' and 'Market' columns ...
[92mPASS[0m: 'Summary' sheet contains both 'Language' and 'Market' columns

SCAN-2 : RC - Summary : Checking if Language' and 'Market' columns are empty ...
[92mPASS[0m: Both 'Language' and 'Market' columns in 'Summary' contains complete data

SCAN-3 : RC - Summary : Checking if '_worker_id' column name is correct ...
[92mPASS[0m: valid '_workder_id' column name

SCAN-4 : RC - Data : Checking if sheet contains 'Language' column ...
[92mPASS[0m: 'Data' sheet contains 'Language' columns

SCAN-5 : RC - Data : Checking if Language' column are empty ...
[92mPASS[0m: 'Language'column in 'Data' contains complete data

SCAN-6 : RC - Data : Checking if '_worker_id' column name is correct ...
[92mPASS[0m: valid '_workder_id' column name

SCAN-7 : RC - Data : checking if columns in

Unnamed: 0,Language,Market,_worker_id,Score,Percentage,Grouping
0,Indonesian,INDONESIAN,45360260,18,0.9,Pilot 2
1,Indonesian,INDONESIAN,45361251,17,0.85,Pilot 2
2,Indonesian,INDONESIAN,45496367,18,0.9,Pilot 2
3,Indonesian,INDONESIAN,45496590,17,0.85,Pilot 2
4,Indonesian,INDONESIAN,45630496,19,0.95,Pilot 2

Unnamed: 0,Language,_id,rater_answer,a_domain,a_register,b_domain,b_register
0,Indonesian,5868416492,no,demeanor/attitude,neutral,demeanor/attitude,neutral
1,Indonesian,5868418444,no,demeanor/attitude,neutral,demeanor/attitude,neutral
2,Indonesian,5868418802,no,demeanor/attitude,neutral,demeanor/attitude,neutral
3,Indonesian,5868418927,no,demeanor/attitude,neutral,demeanor/attitude,neutral
4,Indonesian,5868420074,yes,demeanor/attitude,neutral,demeanor/attitude,neutral



Data integrity report post clean-up:

[1mReading Vocab_1 raw data and perform data integrity scanning...:
[0m

SCAN-1 : Vocab_1 - Summary : Checking if the sheet contains either 'Language' and 'Market' columns ...
[92mPASS[0m: 'Summary' sheet contains both 'Language' and 'Market' columns

SCAN-2 : Vocab_1 - Summary : Checking if Language' and 'Market' columns are empty ...
[92mPASS[0m: Both 'Language' and 'Market' columns in 'Summary' contains complete data

SCAN-3 : Vocab_1 - Summary : Checking if '_worker_id' column name is correct ...
[92mPASS[0m: valid '_workder_id' column name

SCAN-4 : Vocab_1 - Data : Checking if sheet contains 'Language' column ...
[92mPASS[0m: 'Data' sheet contains 'Language' columns

SCAN-5 : Vocab_1 - Data : Checking if Language' column are empty ...
[92mPASS[0m: 'Language'column in 'Data' contains complete data

SCAN-6 : Vocab_1 - Data : Checking if '_worker_id' column name is correct ...
[92mPASS[0m: valid '_workder_id' column name
[1m
Voca

Unnamed: 0,Language,Market,_worker_id,Score,Percentage,Grouping
0,Indonesian,INDONESIAN,45360260,30,0.75,Pilot 2
1,Indonesian,INDONESIAN,45361251,32,0.8,Pilot 2
2,Indonesian,INDONESIAN,45496367,30,0.75,Pilot 2
3,Indonesian,INDONESIAN,45496590,25,0.625,Pilot 2
4,Indonesian,INDONESIAN,45630496,34,0.85,Pilot 2

Unnamed: 0,Language,_id,rater_answer,a_domain,a_register,b_domain,b_register
0,Indonesian,5868419633,a_and_b_have_the_same_meaning,personal,slang/informal,personal,slang/informal
1,Indonesian,5868421716,a_and_b_have_the_same_meaning,personal,slang/informal,personal,slang/informal
2,Indonesian,5868425189,a_and_b_have_the_same_meaning,personal,slang/informal,personal,slang/informal
3,Indonesian,5868425760,a_and_b_have_the_same_meaning,personal,slang/informal,personal,slang/informal
4,Indonesian,5868427602,a_and_b_have_the_same_meaning,personal,slang/informal,personal,slang/informal



Data integrity report post clean-up:

[1mReading Vocab_2 raw data and perform data integrity scanning...:
[0m

SCAN-1 : Vocab_2 - Summary : Checking if the sheet contains either 'Language' and 'Market' columns ...
[92mPASS[0m: 'Summary' sheet contains both 'Language' and 'Market' columns

SCAN-2 : Vocab_2 - Summary : Checking if Language' and 'Market' columns are empty ...
[92mPASS[0m: Both 'Language' and 'Market' columns in 'Summary' contains complete data

SCAN-3 : Vocab_2 - Summary : Checking if '_worker_id' column name is correct ...
[92mPASS[0m: valid '_workder_id' column name

SCAN-4 : Vocab_2 - Data : Checking if sheet contains 'Language' column ...
[92mPASS[0m: 'Data' sheet contains 'Language' columns

SCAN-5 : Vocab_2 - Data : Checking if Language' column are empty ...
[92mPASS[0m: 'Language'column in 'Data' contains complete data

SCAN-6 : Vocab_2 - Data : Checking if '_worker_id' column name is correct ...
[92mPASS[0m: valid '_workder_id' column name
[1m
Voca