In [None]:
# Clear names left in the kernel namespace by earlier runs. Plain %reset asks
# for confirmation first; `%reset -f` is the form that skips the prompt.
%reset

###################################################################       
#Script Name    :                                                                                              
#Description    :                                                                                 
#Args           :                                                                                           
#Author         : Nor Raymond                                                
#Email          : nraymond@appen.com                                          
###################################################################

In [38]:
import os
import pandas as pd
import numpy as np
import yaml
from IPython.core.display import display, HTML

# Function to load yaml configuration file
def load_config(config_name):
    """Parse a YAML file located in the configuration directory.

    Args:
        config_name: File name (for example ``"catalog.yml"``) that is joined
            onto the module-level ``config_path``.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file's content.
    """
    # config_path is a module-level global that is looked up at call time.
    yaml_location = os.path.join(config_path, config_name)
    with open(yaml_location, 'r') as stream:
        parsed = yaml.safe_load(stream)
    return parsed

# Directory, relative to the working directory, that holds the YAML configuration.
config_path = "conf/base"

try:

    # First attempt: resolve conf/base from the current working directory.
    config = load_config("catalog.yml")

    os.chdir(config["project_path"])
    root_path = os.getcwd()

except Exception:

    # Fallback: step one directory up and retry the same sequence.
    # `Exception` (rather than a bare `except:`) keeps Ctrl-C and SystemExit
    # from being swallowed and silently turned into a directory change.
    os.chdir('..')
    config = load_config("catalog.yml")

    os.chdir(config["project_path"])
    root_path = os.getcwd()

In [39]:
#import functions from data_integrity_scanner module
from src.data.data_integrity_scanner import print_scan_results, summary_col_check, summary_col_value_check, col_header_check
from src.data.data_integrity_scanner import data_col_check, data_col_value_check, data_col_header_check
from src.data.data_integrity_scanner import data_integrity_check

# import functions from data_ingestion module
from src.data.data_ingestion import data_ingestion_initialize, raw_file_checker, create_dataframes

### Data cleaning

#### Initializing data cleaning step function

In [40]:
def data_cleaning_initialize(file_exists):
    """Interactively collect the Language and Market code for the current files.

    The user is either asked to type both values, or is offered a language
    taken from the RC file name (the text before ``"_RC"``) together with the
    temporary market code ``"XX-XX"``.

    Every yes/no question is repeated until it receives ``y``/``yes`` or
    ``n``/``no`` (case-insensitive). Previously only the first question was
    validated; an unexpected answer to any later question left ``language``
    or ``market`` unassigned and the function crashed on ``return``.

    Args:
        file_exists: Mapping whose ``'RC'`` entry is the RC file name. It is
            only read when the user asks for a language suggestion.

    Returns:
        tuple: ``(language, market)``. Both are empty strings when the user
        declines to provide or accept values, which signals that automated
        cleaning must not be performed.
    """
    abort_message = "\nPlease find out the Language and Market code before proceeding - automated data cleaning will NOT be performed. \n"

    def _ask_yes_no(prompt):
        # Keep asking until the reply is a recognisable yes or no.
        while True:
            reply = input(prompt).strip().lower()
            if reply in ("y", "yes"):
                return True
            if reply in ("n", "no"):
                return False
            print("\nPlease enter either 'y' or 'n' only!")

    if _ask_yes_no("\nDo you know the 'Language' and/or 'Market code' for this file? (y/n) : "):
        language = input("\nPlease enter the Language: ")
        market = input("\nPlease enter the Market code: eg. EN-EN for English : ").upper()
        return language, market

    if not _ask_yes_no("\nWould you like a suggestion for Language (extracted from filename)? (y/n) : "):
        print(abort_message)
        return '', ''

    # Suggested language = everything in the RC file name before "_RC".
    language = file_exists['RC'].split("_RC")[0]

    if not _ask_yes_no(f"\nThe suggested language is : {language} . Do you accept this suggestion? (y/n) : "):
        print(abort_message)
        return '', ''

    if _ask_yes_no("\nWould you like a default prefill for Market (XX-XX). This will only serve as a temporary value, "
                   "please change this as soon as the actual value is known ? (y/n) : "):
        return language, "XX-XX"

    print(abort_message)
    return '', ''
        

#### Deploy data cleaning step function

In [41]:
def data_cleaning_deploy(file_initials, file_exists, rc_filepath, v1_filepath , v2_filepath, language, market, ref_data_cols, ref_filepath, out_data_path, run_value, run_value_2):
    """Run the automated cleaning for every file type after validating the language.

    Cleaning only starts when both ``language`` and ``market`` are non-empty.
    If ``language`` differs from the language embedded in the RC file name
    (the text before ``"_RC"``), the user chooses between adopting the file
    name's language, keeping the typed one, or aborting. In both "continue"
    cases the market code is reset to the temporary value ``'XX-XX'``.

    Yes/no questions are repeated until a valid answer is given; previously an
    unrecognised answer ended the function without any message.

    Args:
        file_initials: Accepted for interface compatibility (see note below).
        file_exists: Mapping whose ``'RC'`` entry is the RC file name; it is
            also forwarded to ``clean_data_all``.
        rc_filepath, v1_filepath, v2_filepath: Forwarded to ``clean_data_all``.
        language: Language entered or accepted by the user.
        market: Market code entered or accepted by the user.
        ref_data_cols, ref_filepath, out_data_path: Forwarded to ``clean_data_all``.
        run_value: Run type; also used in the completion message.
        run_value_2: Pilot subfolder name, forwarded to ``clean_data_all``.

    Returns:
        None
    """
    # NOTE(review): the incoming ``file_initials`` argument has always been
    # replaced by this fixed list. That behaviour is kept so existing callers
    # see no change - confirm whether the parameter should be honoured instead.
    file_initials = ['RC', 'Vocab_1', 'Vocab_2']

    def _ask_yes_no(prompt):
        # Keep asking until the reply is a recognisable yes or no.
        while True:
            reply = input(prompt).strip().lower()
            if reply in ("y", "yes"):
                return True
            if reply in ("n", "no"):
                return False
            print("\nPlease enter either 'y' or 'n' only!")

    def _run_cleaning(run_language, run_market):
        # Clean and export every file type with the agreed language / market.
        print('\nStarting automated data cleaning....\n')

        for file_initial in file_initials:

            clean_data_all(file_initial, rc_filepath, v1_filepath , v2_filepath, run_language, run_market, ref_data_cols, ref_filepath,
                           out_data_path, file_exists, run_value, run_value_2)

        print(f"Automated data cleaning completed. Cleaned excel files are located in data > processed > {run_value} folder. \n")

    # check that language and market is not empty
    if language == '' or market == '':
        print('The values for Language and Market must be known before initializing automated data cleaning!')
        return

    # check that language input is consistent with the filename
    file_language = file_exists['RC'].split("_RC")[0]
    if language == file_language:
        _run_cleaning(language, market)
        return

    print('\nWARNING: Language input is inconsistent with the filename!\n')
    print(f'Input Language: {language}')
    print(f'File Language: {file_language}')

    if _ask_yes_no("\nWould you like to default the language name as per the filename? (y/n) : "):
        language = file_language
    elif not _ask_yes_no(f"\nAre you sure you want to keep the language as {language}? (y/n) : "):
        print('Automated data processing will not run due to language inconsistency. Please try again.')
        return

    # Whichever language was chosen, the market falls back to the placeholder.
    market = 'XX-XX'
    print(f'\nLanguage has been set to: {language}')
    print(f'Temporary market code has been set to: {market}\n')

    _run_cleaning(language, market)

def run_selection():
    """Ask for the run type and, for non-deployment runs, the pilot subfolder.

    Returns:
        tuple: ``(run_value, run_value_2)``. ``run_value_2`` is an empty
        string when the run type is exactly ``'Deployment'``; otherwise it is
        the subfolder name typed by the user.
    """
    run_value = str(input("\nPlease input the type of run e.g. Deployment, Pilot 1, Pilot 2, Pilot 3 .... etc.: "))
    print(f"\nRun type: {run_value}")

    # Deployment runs have no pilot subfolder, so there is nothing more to ask.
    if run_value == 'Deployment':
        return run_value, ''

    pilot_subfolder = str(input("\nPlease input the pilot subfolder name e.g. Pilot 1A, Pilot 2C, Pilot 3A-B .... etc.: "))
    print(f"\nPilot subfolder: {pilot_subfolder}")
    return run_value, pilot_subfolder

#### Functions for data cleaning

In [42]:
def display_df_side_by_side(dfs:list, captions:list):
    """Display tables side by side to save vertical space.

    Each frame is rendered through its Styler as an inline HTML table with a
    caption, followed by three non-breaking spaces, and the concatenated HTML
    is shown with ``display``.

    Frames and captions are paired positionally. The pairs are iterated
    directly rather than through a dict keyed by caption, so tables that
    happen to share a caption are all shown instead of only the last one.
    Pairing stops at the shorter of the two lists.

    Input:
        dfs: list of pandas.DataFrame
        captions: list of table captions
    """
    html_parts = []
    for caption, df in zip(captions, dfs):
        styled = df.style.set_table_attributes("style='display:inline'").set_caption(caption)
        # "\xa0" is a non-breaking space: a small horizontal gap between tables
        html_parts.append(styled._repr_html_() + "\xa0\xa0\xa0")
    display(HTML("".join(html_parts)))
    
    
def clean_summary_sheet(df_summary, df_summary_cols, file_initial, language, market):
    """Repair the 'Summary' sheet so that it passes SCAN-1 and SCAN-3.

    * SCAN-1 (``summary_col_check``) failing: the 'Language' and/or 'Market'
      columns absent from ``df_summary_cols`` are inserted at positions 0 / 1,
      filled with ``language`` / ``market``.
    * SCAN-3 (``col_header_check``) failing: the first column whose name
      contains ``'worker'`` is renamed to ``'_worker_id'``.

    Fixes compared with the previous version:
      - insertion happens on a real copy, so the caller's frame is no longer
        modified behind its back (the old "copy" was only an alias);
      - the follow-up scans receive the cleaned frame and its refreshed column
        list instead of the stale ``df_summary_cols``;
      - a sheet without any 'worker' column prints a warning instead of
        raising ``IndexError``.

    Args:
        df_summary: DataFrame read from the 'Summary' sheet.
        df_summary_cols: Column names of ``df_summary``.
        file_initial: File type label passed through to the scan functions.
        language: Value used to fill an inserted 'Language' column.
        market: Value used to fill an inserted 'Market' column.

    Returns:
        The cleaned DataFrame (``df_summary`` itself when nothing had to change).
    """
    # SCAN-1 - only clean when the condition is False / FAIL
    col_condition_1, scan_num = summary_col_check(df_summary, df_summary_cols, file_initial , 'Summary')

    if col_condition_1 == False:

        print("Language and Market columns and values inserted to 'Summary' sheet")
        # work on a copy so the caller's frame stays untouched
        df_summary_cleaned = df_summary.copy()

        # insert "Language" and "Market" columns into "Summary" sheet
        # values are defined in the input arguments
        if 'Language' not in df_summary_cols:
            df_summary_cleaned.insert(0, 'Language', language)

        if 'Market' not in df_summary_cols:
            df_summary_cleaned.insert(1, 'Market', market)

        df_summary_cleaned_cols = list(df_summary_cleaned.columns)

        # rescan SCAN-1 and SCAN-2 against the cleaned sheet
        summary_col_check(df_summary_cleaned, df_summary_cleaned_cols, file_initial , 'Summary')
        summary_col_value_check(df_summary_cleaned, file_initial, sheets = 'Summary')

    else:

        df_summary_cleaned = df_summary

    # SCAN-3 - only clean when the condition is False / FAIL
    col_condition_3, scan_num = col_header_check(df_summary_cleaned, file_initial, 'Summary')

    if col_condition_3 == False:

        worker_cols = [col for col in df_summary_cleaned.columns
                       if isinstance(col, str) and 'worker' in col]

        if worker_cols:

            print("Column name worker_id replaced with _worker_id")
            # rename returns a new frame, the input is not modified
            df_summary_cleaned = df_summary_cleaned.rename(columns={worker_cols[0]: "_worker_id"})

            # rescan SCAN-3
            col_header_check(df_summary_cleaned, file_initial, 'Summary')

        else:

            print("WARNING: no column containing 'worker' was found in 'Summary' sheet - '_worker_id' could not be fixed automatically.")

    return df_summary_cleaned

def clean_RC_data_sheet_columns(df_data_cleaned, ref_data, ref_filepath):
    """Insert the reference columns that are missing from an RC 'Data' sheet.

    The expected column names are read from the first column of the
    ``columns_check`` sheet in the workbook at ``ref_filepath``. Every
    expected name absent from ``df_data_cleaned`` is inserted, filled with
    NaN, at the position it has in the reference list.

    Fixes compared with the previous version:
      - a frame that already has every reference column is returned unchanged
        instead of raising ``ValueError`` from unpacking an empty ``zip``;
      - the work is done on a copy, so the caller's frame is not modified.

    Args:
        df_data_cleaned: DataFrame holding the RC 'Data' sheet.
        ref_data: Unused; the reference columns are always re-read from
            ``ref_filepath``. Kept for interface compatibility.
        ref_filepath: Path of the reference workbook.

    Returns:
        A new DataFrame containing the original columns plus the missing ones.
    """
    reference_sheet = pd.read_excel(io = ref_filepath, sheet_name="columns_check", header=None)
    ref_data_cols = reference_sheet[0].tolist()

    df_data_col_add = df_data_cleaned.copy()
    present_cols = set(df_data_col_add.columns)

    # Walk the reference list in order: by the time position ``i`` is reached
    # every earlier reference column exists, so the insert index is in range.
    for position, col_name in enumerate(ref_data_cols):
        if col_name not in present_cols:
            df_data_col_add.insert(position, col_name, np.nan)
            present_cols.add(col_name)

    return df_data_col_add

def clean_V_data_sheet_columns(df_data_cleaned):
    """Reduce the candidate answer columns of a Vocab 'Data' sheet.

    The columns at positions 2, 3 and 4 are inspected. A column without any
    null value is renamed to ``"rater_answer"``; a column containing at least
    one null is dropped. All other columns are left as they are.

    The previous version dropped columns with ``inplace=True``, which modified
    the caller's frame only up to the first rename (a data-dependent side
    effect). The result is now always built as a new frame.

    Args:
        df_data_cleaned: DataFrame holding the Vocab 'Data' sheet.

    Returns:
        A new DataFrame with the incomplete candidate columns removed and the
        complete one(s) renamed to ``"rater_answer"``.
    """
    candidate_cols = df_data_cleaned.columns.tolist()[2:5]

    # a candidate is "complete" when it holds no null value at all
    complete_cols = [col for col in candidate_cols
                     if df_data_cleaned[col].notnull().values.all()]
    incomplete_cols = [col for col in candidate_cols if col not in complete_cols]

    # NOTE(review): if more than one candidate is complete they all end up
    # named "rater_answer" (unchanged from before) - confirm this cannot occur.
    return (
        df_data_cleaned
        .drop(columns=incomplete_cols)
        .rename(columns={col: "rater_answer" for col in complete_cols})
    )
     
def clean_data_sheet(df_data, df_data_cols, file_initial, language, market, ref_data_cols, ref_filepath):
    """Repair the 'Data' sheet so that it passes SCAN-4, SCAN-6 and (RC only) SCAN-7.

    * SCAN-4 (``data_col_check``) failing: a 'Language' column filled with
      ``language`` is inserted at position 0.
    * SCAN-6 (``col_header_check``) failing: the first column whose name
      contains ``'worker'`` is renamed to ``'_worker_id'``.
    * RC files, SCAN-7 (``data_col_header_check``) failing: missing reference
      columns are added through ``clean_RC_data_sheet_columns``.
    * Vocab_1 / Vocab_2 files: candidate answer columns are reduced through
      ``clean_V_data_sheet_columns``.

    Fixes compared with the previous version:
      - the 'Language' insertion happens on a real copy instead of an alias of
        the caller's frame;
      - SCAN-7 and all follow-up scans look at the *current* columns of the
        cleaned frame. Previously they used the stale ``df_data_cols``, so
        columns already repaired above were still reported as missing;
      - a sheet without any 'worker' column prints a warning instead of
        raising ``IndexError``.

    Args:
        df_data: DataFrame read from the 'Data' sheet.
        df_data_cols: Column names of ``df_data``.
        file_initial: ``'RC'``, ``'Vocab_1'`` or ``'Vocab_2'``.
        language: Value used to fill an inserted 'Language' column.
        market: Not used by this function; kept for interface compatibility.
        ref_data_cols: Reference column names used by SCAN-7.
        ref_filepath: Path of the reference workbook (RC files only).

    Returns:
        The cleaned DataFrame.
    """
    # SCAN-4 - only clean when the condition is False / FAIL
    col_condition_4, scan_num = data_col_check(df_data, df_data_cols, file_initial, sheets = 'Data')

    if col_condition_4 == False:

        print("Language column and values inserted to 'Data' sheet")
        # work on a copy so the caller's frame stays untouched
        df_data_cleaned = df_data.copy()

        # insert "Language" column into "Data" sheet
        df_data_cleaned.insert(0, 'Language', language)
        df_data_cleaned_cols = list(df_data_cleaned.columns)

        # rescan SCAN-4 and SCAN-5 against the cleaned sheet
        data_col_check(df_data_cleaned, df_data_cleaned_cols, file_initial, sheets = 'Data')
        data_col_value_check(df_data_cleaned, file_initial, sheets = 'Data')

    else:

        df_data_cleaned = df_data

    # SCAN-6 - only clean when the condition is False / FAIL
    col_condition_6, scan_num = col_header_check(df_data_cleaned, file_initial, 'Data')

    if col_condition_6 == False:

        worker_cols = [col for col in df_data_cleaned.columns
                       if isinstance(col, str) and 'worker' in col]

        if worker_cols:

            print("Column name worker_id replaced with _worker_id")
            # rename returns a new frame, the input is not modified
            df_data_cleaned = df_data_cleaned.rename(columns={worker_cols[0]: "_worker_id"})

            # rescan SCAN-6
            col_header_check(df_data_cleaned, file_initial, 'Data')

        else:

            print("WARNING: no column containing 'worker' was found in 'Data' sheet - '_worker_id' could not be fixed automatically.")

    if file_initial == 'RC':

        # SCAN-7 - compare the columns as they are *now* with the reference
        df_data_cleaned_cols = list(df_data_cleaned.columns)
        col_condition_7, scan_num = data_col_header_check(df_data_cleaned_cols, ref_data_cols, file_initial, sheets = 'Data')

        if col_condition_7 == False:

            print("Missing columns inserted into 'Data' sheet.")
            df_data_cleaned = clean_RC_data_sheet_columns(df_data_cleaned, ref_data_cols, ref_filepath)
            df_data_cleaned_cols = list(df_data_cleaned.columns)

            # rescan SCAN-7
            data_col_header_check(df_data_cleaned_cols, ref_data_cols, file_initial, sheets = 'Data')

    if file_initial in ('Vocab_1', 'Vocab_2'):

        print(f"Removing unwanted columns from {file_initial} Data sheet")
        df_data_cleaned = clean_V_data_sheet_columns(df_data_cleaned)

    return df_data_cleaned

def file_check_create(out_data_path, run_value, run_value_2):
    """Return the output folder for this run, creating it when it is absent.

    Args:
        out_data_path: Base directory for processed output.
        run_value: Run type. ``'Deployment'`` maps to
            ``<out_data_path>/Deployment``; any other value maps to
            ``<out_data_path>/<run_value>/<run_value_2>``.
        run_value_2: Pilot subfolder name (ignored for deployment runs).

    Returns:
        str: Path of the output folder.
    """
    if run_value == 'Deployment':
        sub_path = ('Deployment',)
    else:
        sub_path = (run_value, run_value_2)

    out_folder = os.path.join(out_data_path, *sub_path)

    # Only create the folder when nothing exists at that path yet.
    folder_missing = not os.path.exists(out_folder)
    if folder_missing:
        os.makedirs(out_folder, exist_ok=True)

    return out_folder

def write_to_excel(file_initial, out_folder, file_exists, df_summary_cleaned, df_data_cleaned, df_ans_key, encode=None):
    """Write the cleaned sheets of one file type to an ``.xlsx`` workbook.

    The workbook is created in ``out_folder`` under the source file's name
    with its extension replaced by ``.xlsx`` and holds three sheets:
    'Summary', 'Data' and 'Answer Key' (all written without the index).

    Changes compared with the previous version:
      - ``encoding=`` is no longer forwarded to ``DataFrame.to_excel``; the
        keyword was deprecated in pandas 1.5 and removed in pandas 2.0, where
        passing it raises ``TypeError``;
      - only the final extension is stripped (``os.path.splitext``), so a name
        such as ``Lang_RC_v1.2.xlsx`` is no longer truncated at its first dot;
      - the three identical per-type branches are merged into one.

    Args:
        file_initial: ``'RC'``, ``'Vocab_1'`` or ``'Vocab_2'``. Any other
            value results in no file being written.
        out_folder: Directory that receives the workbook.
        file_exists: Mapping from file type to source file name.
        df_summary_cleaned: Frame for the 'Summary' sheet.
        df_data_cleaned: Frame for the 'Data' sheet.
        df_ans_key: Frame for the 'Answer Key' sheet.
        encode: Unused; kept so existing calls remain valid.

    Returns:
        None
    """
    if file_initial not in ('RC', 'Vocab_1', 'Vocab_2'):
        return

    workbook_stem = os.path.splitext(file_exists[file_initial])[0]
    workbook_path = os.path.join(out_folder, workbook_stem + '.xlsx')

    sheets = (
        ('Summary', df_summary_cleaned),
        ('Data', df_data_cleaned),
        ('Answer Key', df_ans_key),
    )

    with pd.ExcelWriter(workbook_path) as writer:
        for sheet_name, frame in sheets:
            frame.to_excel(writer, sheet_name=sheet_name, index=False)

def clean_data(file_initial, rc_filepath, v1_filepath , v2_filepath, language, market, ref_data_cols, ref_filepath): 
    """Load one file type, clean its sheets, preview them and report integrity.

    Steps: build the frames with ``create_dataframes``, clean the 'Summary'
    and 'Data' sheets, show the first rows / first seven columns of both
    results side by side, then run ``data_integrity_check`` on the cleaned
    frames.

    Args:
        file_initial: File type label (``'RC'``, ``'Vocab_1'`` or ``'Vocab_2'``).
        rc_filepath, v1_filepath, v2_filepath: Passed to ``create_dataframes``.
        language: Language value used when a 'Language' column must be added.
        market: Market value used when a 'Market' column must be added.
        ref_data_cols: Reference column names for the integrity scans.
        ref_filepath: Path of the reference workbook.

    Returns:
        tuple: ``(df_summary_cleaned, df_data_cleaned, df_ans_key)``.
    """
    catalog = create_dataframes(file_initial, rc_filepath, v1_filepath , v2_filepath)
    answer_key = catalog["df_ans_key"]
    # looked up as before, but not used further in this function
    answer_key_cols = catalog["df_ans_key_cols"]

    # Summary sheet
    summary_cleaned = clean_summary_sheet(
        catalog["df_summary"], catalog["df_summary_cols"], file_initial, language, market
    )
    summary_cleaned_cols = list(summary_cleaned.columns)

    # Data sheet
    data_cleaned = clean_data_sheet(
        catalog["df_data"], catalog["df_data_cols"], file_initial, language, market, ref_data_cols, ref_filepath
    )
    data_cleaned_cols = list(data_cleaned.columns)

    # Preview: head of the first seven columns of each cleaned frame
    print('\nPreview cleaned datasets:\n')
    previews = [frame.iloc[:,:7].head() for frame in (summary_cleaned, data_cleaned)]
    display_df_side_by_side(previews, ['df_summary_cleaned', 'df_data_cleaned'])

    # Integrity report on the cleaned frames
    print('\nData integrity report post clean-up:\n')
    data_integrity_check(summary_cleaned, summary_cleaned_cols, data_cleaned, data_cleaned_cols, file_initial, ref_data_cols)

    return summary_cleaned, data_cleaned, answer_key

def clean_data_all(file_initial, rc_filepath, v1_filepath , v2_filepath, language, market, ref_data_cols, ref_filepath, out_data_path, file_exists, run_value, run_value_2):
    """Clean one file type and export the result to the run's output folder.

    Chains ``clean_data`` (clean + report), ``file_check_create`` (resolve /
    create the output folder) and ``write_to_excel`` (export the workbook).

    Returns:
        None
    """
    cleaned = clean_data(
        file_initial, rc_filepath, v1_filepath , v2_filepath,
        language, market, ref_data_cols, ref_filepath,
    )
    summary_frame, data_frame, answer_key_frame = cleaned

    destination = file_check_create(out_data_path, run_value, run_value_2)

    write_to_excel(file_initial, destination, file_exists,
                   summary_frame, data_frame, answer_key_frame, encode=None)

### Data Cleaning Run All

In [43]:
def main():
    """Run the interactive data-cleaning workflow end to end.

    Steps: initialise data ingestion, print its status message, ask for the
    run type, ask for Language / Market, then deploy the automated cleaning.

    If any step raises, the reason is printed, data ingestion is initialised
    once more to obtain its status message, and empty strings are returned.
    ``Exception`` is caught instead of using a bare ``except:`` so that
    Ctrl-C at a prompt (KeyboardInterrupt) and SystemExit are not swallowed,
    and the underlying error is no longer hidden from the user.

    Returns:
        tuple: ``(language, market, run_value, run_value_2)``; four empty
        strings when the workflow did not complete.
    """
    # initialize data ingestion and obtain the data catalog dictionary (all variables)
    print("Initialize data ingestion and file checking...")

    try:

        file_initials = ['RC', 'Vocab_1', 'Vocab_2']

        data_catalog = data_ingestion_initialize(root_path, config_path)

        # Every entry is read up front, in the original order, so an incomplete
        # catalog fails here before any question is put to the user
        # (presumably the catalog is a dict - verify against data_ingestion).
        catalog_keys = (
            "raw_data_path", "out_data_path", "ref_path", "ref_filepath",
            "ref_data", "ref_data_cols", "files", "file_exists",
            "rc_filepath", "v1_filepath", "v2_filepath", "condition", "message",
        )
        catalog = {key: data_catalog[key] for key in catalog_keys}
        print("\n" + catalog["message"])

        run_value, run_value_2 = run_selection()
        language, market = data_cleaning_initialize(catalog["file_exists"])
        data_cleaning_deploy(file_initials, catalog["file_exists"], catalog["rc_filepath"],
                             catalog["v1_filepath"], catalog["v2_filepath"], language, market,
                             catalog["ref_data_cols"], catalog["ref_filepath"],
                             catalog["out_data_path"], run_value, run_value_2)

    except Exception as error:

        # Surface what went wrong instead of failing silently.
        print(f"\nData cleaning did not complete ({type(error).__name__}: {error})")

        data_catalog = data_ingestion_initialize(root_path, config_path)
        print(data_catalog["message"])

        language, market, run_value, run_value_2 = '','','',''

    return language, market, run_value, run_value_2

if __name__ == "__main__":

    # Run the interactive cleaning workflow and keep its four answers as
    # module-level names (all of them are '' when main() took its fallback branch).
    language, market, run_value, run_value_2  = main()

Initialize data ingestion and file checking...

PASS: All files exists!



Please input the type of run e.g. Deployment, Pilot 1, Pilot 2, Pilot 3 .... etc.:  Pilot 3



Run type: Pilot 3



Please input the pilot subfolder name e.g. Pilot 1A, Pilot 2C, Pilot 3A-B .... etc.:  Pilot 3A



Pilot subfolder: Pilot 3A



Do you know the 'Language' and/or 'Market code' for this file? (y/n) :  y

Please enter the Language:  Chinese-Traditional

Please enter the Market code: eg. EN-EN for English :  ZH-ZH



Starting automated data cleaning....

Dataframe created from RC file
Language and Market columns and values inserted to 'Summary' sheet
Language column and values inserted to 'Data' sheet
Missing columns inserted into 'Data' sheet.

Preview cleaned datasets:



Unnamed: 0,Language,Market,_worker_id,Score,Percentage,Grouping
0,Chinese-Traditional,ZH-ZH,45488857,19,0.791667,Pilot 3A
1,Chinese-Traditional,ZH-ZH,45492033,21,0.875,Pilot 3A
2,Chinese-Traditional,ZH-ZH,45492267,21,0.875,Pilot 3A
3,Chinese-Traditional,ZH-ZH,45492778,20,0.833333,Pilot 3A
4,Chinese-Traditional,ZH-ZH,45493055,19,0.791667,Pilot 3A

Unnamed: 0,Language,_id,question_no_1,question_no_2,question_no_3,question_no_4,question_no_5
0,Chinese-Traditional,5868398595.0,a,c,a,,
1,Chinese-Traditional,5868407750.0,a,c,a,,
2,Chinese-Traditional,5868455061.0,c,c,a,,
3,Chinese-Traditional,5868504123.0,a,c,a,,
4,Chinese-Traditional,5868596340.0,c,c,a,,



Data integrity report post clean-up:

[1mReading RC raw data and perform data integrity scanning...:
[0m

SCAN-1 : RC - Summary : Checking if the sheet contains either 'Language' and 'Market' columns ...
[92mPASS[0m: 'Summary' sheet contains both 'Language' and 'Market' columns

SCAN-2 : RC - Summary : Checking if Language' and 'Market' columns are empty ...
[92mPASS[0m: Both 'Language' and 'Market' columns in 'Summary' contains complete data

SCAN-3 : RC - Summary : Checking if '_worker_id' column name is correct ...
[92mPASS[0m: valid '_workder_id' column name

SCAN-4 : RC - Data : Checking if sheet contains 'Language' column ...
[92mPASS[0m: 'Data' sheet contains 'Language' columns

SCAN-5 : RC - Data : Checking if Language' column are empty ...
[92mPASS[0m: 'Language'column in 'Data' contains complete data

SCAN-6 : RC - Data : Checking if '_worker_id' column name is correct ...
[92mPASS[0m: valid '_workder_id' column name

SCAN-7 : RC - Data : checking if columns in

Unnamed: 0,Language,Market,_worker_id,Score,Percentage,Grouping
0,Chinese-Traditional,ZH-ZH,45488857,28,0.7,Pilot 3A
1,Chinese-Traditional,ZH-ZH,45492033,33,0.825,Pilot 3A
2,Chinese-Traditional,ZH-ZH,45492267,30,0.75,Pilot 3A
3,Chinese-Traditional,ZH-ZH,45492778,34,0.85,Pilot 3A
4,Chinese-Traditional,ZH-ZH,45493055,35,0.875,Pilot 3A

Unnamed: 0,Language,_id,rater_answer,a_domain,a_register,b_domain,b_register
0,Chinese-Traditional,5868341514,yes,people,slang/informal,people,slang/informal
1,Chinese-Traditional,5868371216,yes,people,slang/informal,people,slang/informal
2,Chinese-Traditional,5868388043,no,people,slang/informal,people,slang/informal
3,Chinese-Traditional,5868430242,no,people,slang/informal,people,slang/informal
4,Chinese-Traditional,5868447667,yes,people,slang/informal,people,slang/informal



Data integrity report post clean-up:

[1mReading Vocab_1 raw data and perform data integrity scanning...:
[0m

SCAN-1 : Vocab_1 - Summary : Checking if the sheet contains either 'Language' and 'Market' columns ...
[92mPASS[0m: 'Summary' sheet contains both 'Language' and 'Market' columns

SCAN-2 : Vocab_1 - Summary : Checking if Language' and 'Market' columns are empty ...
[92mPASS[0m: Both 'Language' and 'Market' columns in 'Summary' contains complete data

SCAN-3 : Vocab_1 - Summary : Checking if '_worker_id' column name is correct ...
[92mPASS[0m: valid '_workder_id' column name

SCAN-4 : Vocab_1 - Data : Checking if sheet contains 'Language' column ...
[92mPASS[0m: 'Data' sheet contains 'Language' columns

SCAN-5 : Vocab_1 - Data : Checking if Language' column are empty ...
[92mPASS[0m: 'Language'column in 'Data' contains complete data

SCAN-6 : Vocab_1 - Data : Checking if '_worker_id' column name is correct ...
[92mPASS[0m: valid '_workder_id' column name
[1m
Voca

Unnamed: 0,Language,Market,_worker_id,Score,Percentage,Grouping
0,Chinese-Traditional,ZH-ZH,45488857,73,0.9125,Pilot 3A
1,Chinese-Traditional,ZH-ZH,45492033,68,0.85,Pilot 3A
2,Chinese-Traditional,ZH-ZH,45492267,71,0.8875,Pilot 3A
3,Chinese-Traditional,ZH-ZH,45492778,72,0.9,Pilot 3A
4,Chinese-Traditional,ZH-ZH,45493055,64,0.8,Pilot 3A

Unnamed: 0,Language,_id,rater_answer,a_domain,a_register,b_domain,b_register
0,Chinese-Traditional,5868361389.0,a_is_more_specific_than_b,geography,neutral,geography,technical
1,Chinese-Traditional,5868398402.0,a_is_more_specific_than_b,geography,neutral,geography,technical
2,Chinese-Traditional,5868443199.0,a_is_more_specific_than_b,geography,neutral,geography,technical
3,Chinese-Traditional,5868459848.0,a_is_more_specific_than_b,geography,neutral,geography,technical
4,Chinese-Traditional,5868511758.0,a_is_more_specific_than_b,geography,neutral,geography,technical



Data integrity report post clean-up:

[1mReading Vocab_2 raw data and perform data integrity scanning...:
[0m

SCAN-1 : Vocab_2 - Summary : Checking if the sheet contains either 'Language' and 'Market' columns ...
[92mPASS[0m: 'Summary' sheet contains both 'Language' and 'Market' columns

SCAN-2 : Vocab_2 - Summary : Checking if Language' and 'Market' columns are empty ...
[92mPASS[0m: Both 'Language' and 'Market' columns in 'Summary' contains complete data

SCAN-3 : Vocab_2 - Summary : Checking if '_worker_id' column name is correct ...
[92mPASS[0m: valid '_workder_id' column name

SCAN-4 : Vocab_2 - Data : Checking if sheet contains 'Language' column ...
[92mPASS[0m: 'Data' sheet contains 'Language' columns

SCAN-5 : Vocab_2 - Data : Checking if Language' column are empty ...
[92mPASS[0m: 'Language'column in 'Data' contains complete data

SCAN-6 : Vocab_2 - Data : Checking if '_worker_id' column name is correct ...
[92mPASS[0m: valid '_workder_id' column name
[1m
Voca