In [None]:
%reset

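###################################################################
#Script Name    :
#Description    : Loads the RC, Vocab_1 and Vocab_2 Excel workbooks of a
#                 selected run and builds the rater, summary and data DataFrames.
#Args           : None (the run type is requested interactively at run time)
#Author         : Nikhil Rao (original R version); converted to Python by Nor Raymond
#Email          : nraymond@appen.com
###################################################################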

In [6]:
import os
import glob 
import pandas as pd
import numpy as np
import yaml
from IPython.core.display import display, HTML

In [7]:
def load_config(config_name):
    """Read a YAML configuration file from the configuration directory.

    The file is looked up under the module-level ``config_path`` directory,
    which is resolved against the current working directory.

    Args:
        config_name: File name of the YAML document, e.g. ``"catalog.yml"``.

    Returns:
        The parsed document, as produced by ``yaml.safe_load``.
    """
    yaml_location = os.path.join(config_path, config_name)
    with open(yaml_location, 'r') as handle:
        return yaml.safe_load(handle)

# Directory holding the YAML configuration files, relative to the working directory.
config_path = "conf/base"

# Locate the catalog and move into the project root. The first attempt assumes the
# kernel was started in the directory that contains ``conf/base``; if anything goes
# wrong there, step one directory up and try exactly once more.
try:

    # load yaml catalog configuration file
    config = load_config("catalog.yml")

    os.chdir(config["project_path"])
    root_path = os.getcwd()

except Exception:
    # ``except Exception`` (rather than a bare ``except``) keeps the fallback for
    # ordinary failures but lets KeyboardInterrupt / SystemExit propagate, so the
    # user can still interrupt the cell.
    os.chdir('..')
    # load yaml catalog configuration file
    config = load_config("catalog.yml")

    os.chdir(config["project_path"])
    root_path = os.getcwd()

### Functions to initialize data ingestion

In [21]:
def group_files_by_language(data_path, files, file_initials):
    """Bucket file names by the text that precedes their first underscore.

    Args:
        data_path: Not used by the grouping itself; kept in the signature for callers.
        files: Iterable of file names.
        file_initials: Not used by the grouping itself; kept in the signature for callers.

    Returns:
        dict mapping each prefix (``name.split('_')[0]``) to the list of file
        names sharing it, in the order they were encountered.
    """
    grouped = {}
    for filename in files:
        prefix = filename.split('_')[0]
        grouped.setdefault(prefix, []).append(filename)

    return grouped

def create_file_exists_df(files, file_initials):
    """Tabulate which keyword occurs in which file name.

    A (keyword, file name) row is produced for every keyword that appears as a
    substring of a file name; a file matching several keywords yields several rows.

    Args:
        files: Iterable of file names.
        file_initials: Iterable of keywords to look for.

    Returns:
        DataFrame with columns ``Keyword`` and ``Filename``, ordered by file
        and then by keyword.
    """
    matches = [
        (keyword, filename)
        for filename in files
        for keyword in file_initials
        if keyword in filename
    ]

    return pd.DataFrame(matches, columns=['Keyword', 'Filename'])

def data_ingestion_initialize(root_path, run_value):
    """Locate the Excel files of a run and derive the bookkeeping structures.

    Args:
        root_path: Project root directory.
        run_value: Name of the run; also the name of the sub-folder under the
            configured output path that is scanned (e.g. ``'Deployment'``).

    Returns:
        Tuple ``(data_path, files, languages, file_groups, file_exists)``:
            data_path: Directory that was scanned.
            files: Names of the ``.xlsx`` files found in ``data_path``.
            languages: DataFrame with one ``Language`` column holding each
                distinct file-name prefix found in front of ``_RC``,
                ``_Vocab_1`` or ``_Vocab_2``.
            file_groups: Result of ``group_files_by_language``.
            file_exists: Result of ``create_file_exists_df``.
    """
    # load yaml catalog configuration file (module-level helper; the former nested
    # copy was an exact duplicate of it)
    config = load_config("catalog.yml")

    print("Initialize data ingestion and file checking...\n")

    # define data input path; the run name doubles as the folder name, which also
    # covers the 'Deployment' run
    data_path = os.path.join(root_path, config["data_path"]["output"], run_value)

    # get the list of Excel files in the run folder
    files = [f for f in os.listdir(data_path) if f.endswith('.xlsx')]

    file_initials = ['RC', 'Vocab_1', 'Vocab_2']

    # The language is whatever precedes '_<initial>' in the file name. Every initial
    # is tried, so a language is detected even if only some of its workbooks exist;
    # each language is listed once.
    languages = []
    for file in files:
        for file_initial in file_initials:
            marker = '_' + file_initial
            if marker in file:
                lang = file.split(marker)[0]
                if lang not in languages:
                    languages.append(lang)
                break

    languages = pd.DataFrame(languages, columns=['Language'])

    file_groups = group_files_by_language(data_path, files, file_initials)

    file_exists = create_file_exists_df(files, file_initials)

    return data_path, files, languages, file_groups, file_exists
       

### Functions for data processing

In [16]:
# Keywords naming the three workbook types.
# NOTE(review): main() builds an identical local list and passes it explicitly, so
# this module-level copy looks redundant — confirm before removing.
file_initials = ['RC', 'Vocab_1', 'Vocab_2']

def obtain_file_summary_df(file_initials, file_exists, data_path):
    """Read and stack the 'Summary' sheet of every workbook, one frame per keyword.

    Args:
        file_initials: Keywords to process, in the order the results are returned.
        file_exists: DataFrame with ``Keyword`` and ``Filename`` columns.
        data_path: Directory containing the workbooks.

    Returns:
        List with one DataFrame per keyword: the row-wise concatenation of the
        'Summary' sheets of that keyword's files (original row indices kept),
        or an empty DataFrame when the keyword has no files.
    """
    df_summary = []
    for k in file_initials:
        selected_filenames = file_exists.loc[file_exists['Keyword'] == k, 'Filename'].tolist()

        # Read everything first and concatenate once: DataFrame.append no longer
        # exists in pandas 2.x and re-copied the accumulated frame on every call.
        frames = [
            pd.read_excel(os.path.join(data_path, f), sheet_name='Summary')
            for f in selected_filenames
        ]
        df_summary.append(pd.concat(frames) if frames else pd.DataFrame())

    return df_summary

def obtain_file_data_df(file_initials, file_exists, data_path):
    """Read and stack the 'Data' sheet of every workbook, one frame per keyword.

    Args:
        file_initials: Keywords to process, in the order the results are returned.
        file_exists: DataFrame with ``Keyword`` and ``Filename`` columns.
        data_path: Directory containing the workbooks.

    Returns:
        List with one DataFrame per keyword: the row-wise concatenation of the
        'Data' sheets of that keyword's files (original row indices kept),
        or an empty DataFrame when the keyword has no files.
    """
    df_data = []
    for k in file_initials:
        selected_filenames = file_exists.loc[file_exists['Keyword'] == k, 'Filename'].tolist()

        # Read everything first and concatenate once: DataFrame.append no longer
        # exists in pandas 2.x and re-copied the accumulated frame on every call.
        frames = [
            pd.read_excel(os.path.join(data_path, f), sheet_name='Data')
            for f in selected_filenames
        ]
        df_data.append(pd.concat(frames) if frames else pd.DataFrame())

    return df_data

def obtain_distinct_raters(df_summary):
    """Derive the distinct raters and the language list from the summary frames.

    Args:
        df_summary: Sequence whose first three items are the joined 'Summary'
            frames for RC, Vocab_1 and Vocab_2, in that order.

    Returns:
        Tuple ``(raters, r1, r2, r3, languages)``:
            raters: Distinct ``_worker_id`` / ``Grouping`` / ``Market`` /
                ``Language`` rows across the three frames.
            r1, r2, r3: The three input frames, passed through unchanged.
            languages: One-column DataFrame (``Language``) with the distinct
                languages of the RC summary.
    """
    # Summary frames for RC, Vocab_1 and Vocab_2 respectively
    r1 = df_summary[0]
    r2 = df_summary[1]
    r3 = df_summary[2]

    rater_columns = ['_worker_id', 'Grouping', 'Market', 'Language']
    stacked = pd.concat([r1, r2, r3], ignore_index=True)
    raters = stacked[rater_columns].drop_duplicates()

    # the language list is taken from the RC summary only
    distinct_languages = r1.Language.unique().tolist()
    languages = pd.DataFrame(distinct_languages, columns=['Language'])

    return raters, r1, r2, r3, languages

def merge_raters_to_df_data(df_data, raters):
    """Attach rater attributes to the data frames and parse their timestamps.

    Args:
        df_data: Sequence whose first three items are the joined 'Data' frames
            for RC, Vocab_1 and Vocab_2, in that order.
        raters: DataFrame of distinct raters; joined on ``_worker_id`` and
            ``Language``.

    Returns:
        Tuple ``(rc, v1, v2)`` of the left-merged frames, with ``_created_at``
        and ``_started_at`` converted to datetimes.
    """
    join_keys = ['_worker_id', 'Language']
    timestamp_columns = ['_created_at', '_started_at']

    # Merge raters into the RC, Vocab_1 and Vocab_2 data (in that order)
    merged_frames = [
        pd.merge(df_data[position], raters, how='left', on=join_keys)
        for position in range(3)
    ]

    # Convert _created_at and _started_at to date-time
    for frame in merged_frames:
        frame[timestamp_columns] = frame[timestamp_columns].apply(
            pd.to_datetime, format='%m/%d/%Y %H:%M:%S'
        )

    rc, v1, v2 = merged_frames

    return rc, v1, v2


In [18]:
def run_selection():
    """Ask the user which run to process and echo the choice.

    Returns:
        The text typed by the user, as a string.
    """
    prompt = "\nPlease input the type of run eg. Deployment, Pilot 1, Pilot 2A ...etc.: "
    run_value = input(prompt)  # input() already yields a str
    print(f"\nRun type: {run_value}\n")

    return run_value

In [33]:
def main():
    """Run the whole ingestion pipeline for an interactively chosen run.

    Returns:
        Tuple ``(raters, r1, r2, r3, languages, rc, v1, v2, run_value)``:
        the distinct raters, the three summary frames, the language list, the
        three rater-merged data frames and the chosen run name.
    """
    file_initials = ['RC', 'Vocab_1', 'Vocab_2']
    run_value = run_selection()

    # Only the folder and the keyword/file table are needed further down.
    data_path, _files, _languages, _file_groups, file_exists = data_ingestion_initialize(
        root_path, run_value
    )

    summary_frames = obtain_file_summary_df(file_initials, file_exists, data_path)
    data_frames = obtain_file_data_df(file_initials, file_exists, data_path)

    raters, r1, r2, r3, languages = obtain_distinct_raters(summary_frames)
    rc, v1, v2 = merge_raters_to_df_data(data_frames, raters)

    return raters, r1, r2, r3, languages, rc, v1, v2, run_value

if __name__ == "__main__":

    # Run the pipeline and keep every result as a module-level name so that
    # later cells can use them.
    (raters, r1, r2, r3,
     languages,
     rc, v1, v2,
     run_value) = main()

    print(languages)
    print('\nAutomated data processing completed.')


Please input the type of run eg. Deployment, Pilot 1, Pilot 2A ...etc.:  Deployment



Run type: Deployment

Initialize data ingestion and file checking...

              Language
0              Russian
1               Hebrew
2           Indonesian
3  Chinese(Simplified)

Automated data processing completed.
