###################################################################       
#Script Name    :                                                                                              
#Description    :                                                                                 
#Args           :                                                                                           
#Author         : Nikhil Rao in R, converted to Python by Nor Raymond                                              
#Email          : nraymond@appen.com                                          
###################################################################

### Fail Rate Reports for Pilot

In [None]:
import os
import glob 
import pandas as pd
import numpy as np
import yaml
import warnings
from functools import reduce
warnings.filterwarnings("ignore")

In [None]:
def load_config(config_name):
    """Parse a YAML file from the configuration directory.

    ``config_name`` is joined onto the module-level ``config_path`` and the
    resulting file is read with ``yaml.safe_load``. The parsed content is
    returned as-is.
    """
    yaml_file = os.path.join(config_path, config_name)
    with open(yaml_file, 'r') as stream:
        return yaml.safe_load(stream)

config_path = "conf/base"


def _enter_project_root():
    """Load ``catalog.yml`` and change directory to its ``project_path``.

    Returns a ``(config, root_path)`` tuple: the parsed catalog and the
    working directory after the ``chdir``.
    """
    catalog = load_config("catalog.yml")
    os.chdir(catalog["project_path"])
    return catalog, os.getcwd()


try:
    config, root_path = _enter_project_root()
except Exception:
    # First attempt failed (presumably because the notebook was started from a
    # sub-directory where conf/base is not visible -- TODO confirm). Retry once
    # from the parent directory. `Exception` rather than a bare `except` so
    # that Ctrl-C / SystemExit are not swallowed by the fallback.
    os.chdir('..')
    config, root_path = _enter_project_root()

# import data_processing module
import src.data.data_processing as data_processing
# import data_cleaning module
import src.data.data_cleaning as data_cleaning

In [None]:
def language_selection(languages):
    """Prompt on stdin until the user picks a valid language, then return it.

    Parameters
    ----------
    languages : pandas.DataFrame
        Table whose first column holds the language names. The number the
        user types is matched against the frame's index labels.

    Returns
    -------
    The value in the first column of the row whose index label was entered.

    Raises
    ------
    ValueError
        If ``languages`` is empty (there is nothing to choose from).

    Notes
    -----
    The accepted range is advertised from the index labels, so the lookup is
    done by label as well. For a default 0..n-1 index this is the same row a
    positional lookup would return.
    """
    valid_labels = languages.index
    # Computed once up front; on an empty frame this raises immediately
    # instead of being mistaken for a bad user entry inside the prompt loop.
    lowest, highest = min(valid_labels), max(valid_labels)
    names = languages.iloc[:, 0]

    while True:
        reply = input("\nPlease select the number of the Language you are assessing: ")

        if reply.strip() == "":
            print("\nYou must enter any numbers")
            continue

        try:
            language_index = int(reply)
        except ValueError:
            print("\nYou must enter numerical values only... Please try again")
            continue

        if language_index not in valid_labels:
            print(f"\nYou must enter numbers between {lowest} - {highest}... Please try again")
            continue

        language_selected = names.loc[language_index]
        print(f"\nYou have selected {language_index} for {language_selected}")
        return language_selected


#### Functions for Language Modification - getting the overall time taken

In [None]:
# function for Language Modification
def get_time_taken(df, language_selected):
    """Add per-item and per-worker time-taken columns for one language.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Language``, ``_worker_id``, ``_created_at`` and
        ``_started_at``; the two timestamp columns must support subtraction
        yielding timedeltas.
    language_selected
        Value compared against ``df['Language']`` to filter the rows.

    Returns
    -------
    pandas.DataFrame
        The filtered rows (new frame, fresh 0..n-1 index from the merge) with
        two extra columns: ``Time_Taken_Seconds`` (per row) and
        ``Time_Taken_Minutes_Overall`` (sum of seconds per ``_worker_id`` / 60).
        The input ``df`` is not modified.
    """
    # Explicit copy: columns are added below, and assigning on a filtered
    # slice is chained assignment (only hidden by the global warnings filter).
    dfr = df[df['Language'] == language_selected].copy()

    # Time Taken by Item
    # NOTE(review): `.dt.seconds` is the seconds *component* of the timedelta
    # (whole days are dropped). Kept as-is to preserve the integer column;
    # confirm no item can span more than 24h, else use `.dt.total_seconds()`.
    dfr["Time_Taken_Seconds"] = (dfr['_created_at'] - dfr['_started_at']).dt.seconds

    # Time Taken Overall: sum only the seconds column per worker. Passing the
    # column name to `.sum()` would land in its `numeric_only` slot instead.
    minutes_overall = (
        dfr.groupby('_worker_id')["Time_Taken_Seconds"]
        .sum()
        .div(60)
        .rename("Time_Taken_Minutes_Overall")
        .reset_index()
    )

    return pd.merge(dfr, minutes_overall, how='left', on='_worker_id')

def get_time_taken_all(language_selected, rc, v1, v2):
    """Run ``get_time_taken`` on the three input frames for one language.

    The frames are processed in the order ``rc``, ``v1``, ``v2`` and the
    results are returned as a tuple in that same order.
    """
    rcR, v1R, v2R = [
        get_time_taken(frame, language_selected) for frame in (rc, v1, v2)
    ]
    return rcR, v1R, v2R

#### Functions for pilot variant selectors

In [None]:
def report_1_selector(pilot_var_selected):
    """Return the column configuration for report 1 for a pilot variant.

    Parameters
    ----------
    pilot_var_selected : str
        One of 'Pilot 1A-1B', 'Pilot 1C', 'Pilot 1D', 'Pilot 1E',
        'Pilot 1E(ES)', 'Pilot 2A', 'Pilot 2B-A', 'Pilot 2D', 'Pilot 3A'.

    Returns
    -------
    dict
        Keys ``select1``, ``groupby1``, ``groupby2`` (lists of column names),
        ``sort_values`` (column names) and ``sort_order`` (matching list of
        ascending flags). Every value is a freshly built list.

    Raises
    ------
    ValueError
        If the variant is not one of the names above (previously this fell
        through to the ``return`` and failed with ``UnboundLocalError``).
    """
    # Item-level columns shared by every variant, always in this order.
    item_cols = ['_unit_id', 'question_', 'a_domain', 'a_register', 'wordphrase_a',
                 'b_domain', 'b_register', 'wordphrase_b', 'difficulty']
    outcome_cols = ['Answer', 'Score']

    # The variants differ only in the leading columns of each list.
    if pilot_var_selected == 'Pilot 1A-1B':
        select_lead = ['Language', 'Fluency', 'Tenure', '_worker_id']
        group_lead = ['Language', 'Fluency', 'Tenure']
        rate_lead = ['Fluency', 'Tenure']
        sort_lead = ['Fluency', 'Tenure']
    elif pilot_var_selected == 'Pilot 1C':
        select_lead = ['Language', 'Fluency', 'Tenure', '_worker_id']
        group_lead = ['Language', 'Fluency']
        rate_lead = ['Fluency']
        sort_lead = ['Fluency']
    elif pilot_var_selected in ('Pilot 1D', 'Pilot 1E', 'Pilot 1E(ES)',
                                'Pilot 2A', 'Pilot 2B-A', 'Pilot 2D'):
        select_lead = ['Language', 'Fluency', '_worker_id']
        group_lead = ['Language', 'Fluency']
        rate_lead = ['Fluency']
        sort_lead = ['Fluency']
    elif pilot_var_selected == 'Pilot 3A':
        select_lead = ['Language', 'Fluency', '_worker_id']
        group_lead = ['Language', 'Fluency']
        # 3A is the only variant that keeps Language in the second grouping.
        rate_lead = ['Language', 'Fluency']
        sort_lead = ['Fluency']
    else:
        raise ValueError(f"Unsupported pilot variant for report 1: {pilot_var_selected!r}")

    selector_1 = {
        "select1": select_lead + item_cols + outcome_cols,
        "groupby1": group_lead + item_cols + outcome_cols,
        "groupby2": rate_lead + item_cols,
        # Leading keys ascending, Fail_Rate descending.
        "sort_values": sort_lead + ['Fail_Rate'],
        "sort_order": [True] * len(sort_lead) + [False],
    }

    return selector_1

In [None]:
def report_2_selector(pilot_var_selected):
    """Return the column configuration for report 2 for a pilot variant.

    Parameters
    ----------
    pilot_var_selected : str
        One of 'Pilot 1A-1B', 'Pilot 1C', 'Pilot 1D', 'Pilot 1E',
        'Pilot 1E(ES)', 'Pilot 2A', 'Pilot 2B-A', 'Pilot 2D', 'Pilot 3A'.

    Returns
    -------
    dict
        Keys ``select1``, ``groupby1``, ``groupby2``, ``sort_values``,
        ``sort_order`` (labelled "for v2_fail_rate" in the original code),
        ``select2``, ``groupby3``, ``groupby4``, ``sort_values_2``,
        ``sort_order_2``, ``drop_cols``, ``explode`` ("for v2_fail_rate_2"),
        and ``join_on``, ``select3`` ("for merge_v2_fail_rates").
        Every value is a freshly built list.

    Raises
    ------
    ValueError
        If the variant is not one of the names above (previously this fell
        through to the ``return`` and failed with ``UnboundLocalError``).

    Notes
    -----
    NOTE(review): the original had a second, separate branch for
    'Pilot 2B-A' that set ``select3`` to ``[]``. It could never run because
    'Pilot 2B-A' was already matched by the earlier multi-variant branch, so
    it has been removed and 'Pilot 2B-A' keeps the populated ``select3`` it
    always received at runtime. Confirm which behaviour was intended.
    """
    # Item-level columns shared by every variant, always in this order.
    item_cols = ['_unit_id', 'question_', 'a_domain', 'a_register', 'wordphrase_a',
                 'b_domain', 'b_register', 'wordphrase_b', 'difficulty']

    if pilot_var_selected == 'Pilot 1A-1B':
        # 1A-1B uses the singular 'Answer' column and keeps Tenure throughout.
        lead = ['Language', 'Fluency', 'Tenure']
        return {
            # for v2_fail_rate
            "select1": lead + ['_worker_id'] + item_cols + ['Answer', 'Score'],
            "groupby1": lead + item_cols + ['Answer', 'Score'],
            "groupby2": ['Fluency', 'Tenure'] + item_cols,
            "sort_values": ['Fluency', 'Tenure', 'Fail_Rate'],
            "sort_order": [True, True, False],
            # for v2_fail_rate_2
            "select2": lead + ['_worker_id'] + item_cols + ['rater_answer', 'Answer', 'Score'],
            "groupby3": lead + item_cols + ['rater_answer', 'Answer', 'Score'],
            "groupby4": ['Fluency', 'Tenure'] + item_cols,
            "sort_values_2": ['Fluency', 'Tenure', '_unit_id', 'Fail_Rate'],
            "sort_order_2": [True, True, True, False],
            "drop_cols": [],
            "explode": [],
            "join_on": [],
            "select3": [],
        }

    # Remaining variants share everything except whether Tenure is selected.
    if pilot_var_selected == 'Pilot 1C':
        worker_cols = ['Language', 'Fluency', 'Tenure', '_worker_id']
    elif pilot_var_selected in ('Pilot 1D', 'Pilot 1E', 'Pilot 1E(ES)', 'Pilot 2A',
                                'Pilot 2B-A', 'Pilot 2D', 'Pilot 3A'):
        worker_cols = ['Language', 'Fluency', '_worker_id']
    else:
        raise ValueError(f"Unsupported pilot variant for report 2: {pilot_var_selected!r}")

    keyed_items = ['Language', 'Fluency'] + item_cols

    selector_2 = {
        # for v2_fail_rate
        "select1": worker_cols + item_cols + ['Answers', 'Score'],
        "groupby1": keyed_items + ['Answers', 'Score'],
        "groupby2": ['Fluency'] + item_cols,
        "sort_values": ['Fluency', '_unit_id'],
        "sort_order": [True, True],
        # for v2_fail_rate_2
        "select2": worker_cols + item_cols + ['rater_answer', 'Answers', 'Score'],
        "groupby3": keyed_items + ['rater_answer', 'Answers', 'Score'],
        "groupby4": ['Fluency'] + item_cols,
        "sort_values_2": ['Fluency', '_unit_id', 'Score', 'Rate'],
        "sort_order_2": [True, True, True, False],
        "drop_cols": ['Score', 'Count_of_Test_Takers', 'Total_Test_Takers'],
        "explode": keyed_items + ['Answers'],
        # for merge_v2_fail_rates
        "join_on": keyed_items + ['Answers'],
        "select3": keyed_items + [
            'Count_of_Test_Takers', 'Total_Test_Takers', 'Overall_Fail_Rate', 'Answers',
            'a_and_b_are_not_related', 'a_and_b_are_related', 'a_and_b_have_the_same_meaning',
            'a_is_more_specific_than_b', 'b_is_more_specific_than_a',
        ],
    }

    return selector_2

In [None]:
def report_3_selector(pilot_var_selected):
    """Return the column configuration for report 3 for a pilot variant.

    Parameters
    ----------
    pilot_var_selected : str
        One of 'Pilot 1A-1B', 'Pilot 1C', 'Pilot 1D', 'Pilot 1E',
        'Pilot 1E(ES)', 'Pilot 2A', 'Pilot 2B-A', 'Pilot 2D', 'Pilot 3A'.

    Returns
    -------
    dict
        Keys ``select1``, ``drop_cols_1``, ``drop_cols_2``, ``explode``,
        ``select2``, ``select3``, ``drop_cols_3``, ``explode2``, ``groupby1``,
        ``groupby2``, ``sort_values``, ``sort_order``. For 'Pilot 1A-1B'
        ``select3`` is empty and ``select2`` carries the wide answer columns;
        for all other variants ``select2`` lists the question-choice columns
        and ``select3`` carries the wide answer columns. Every value is a
        freshly built list.

    Raises
    ------
    ValueError
        If the variant is not one of the names above (previously this fell
        through to the ``return`` and failed with ``UnboundLocalError``).
    """
    question_numbers = range(1, 5)

    # Per-question metadata columns, grouped question by question.
    question_meta = [
        column
        for i in question_numbers
        for column in (f'question_{i}_difficulty',
                       f'question_{i}_google_translate_error',
                       f'Question {i} Skill tested')
    ]
    passage_meta = ['register', 'topic', 'text_type', 'complexity', 'familiarity']
    question_cols = [f'question_no_{i}' for i in question_numbers]
    answer_cols = [f'Answer_no_{i}' for i in question_numbers]
    short_answer_cols = [f'a{i}' for i in question_numbers]

    # The variants differ in whether Tenure and test_ are carried along.
    if pilot_var_selected == 'Pilot 1A-1B':
        has_tenure, has_test = True, False
    elif pilot_var_selected == 'Pilot 1C':
        has_tenure, has_test = True, True
    elif pilot_var_selected in ('Pilot 1D', 'Pilot 1E', 'Pilot 1E(ES)', 'Pilot 2A',
                                'Pilot 2B-A', 'Pilot 2D', 'Pilot 3A'):
        has_tenure, has_test = False, True
    else:
        raise ValueError(f"Unsupported pilot variant for report 3: {pilot_var_selected!r}")

    id_cols = ['Language', '_worker_id', '_country', 'Fluency']
    if has_tenure:
        id_cols.append('Tenure')
    id_cols += ['Time_Taken_Seconds', '_unit_id', 'title']
    if has_test:
        id_cols.append('test_')

    wide_cols = id_cols + question_meta + passage_meta + question_cols + answer_cols + ['Score']

    if has_test:
        # for melt_rc
        select2 = ['Language', '_unit_id', 'title', 'test_'] + [
            f'question_{i}_choice_{j}' for i in question_numbers for j in range(1, 4)
        ]
        # for melt_rc_answer_actual
        select3 = list(wide_cols)
        # for rc_q_s_pass_rate
        groupby1 = ['Language', 'Fluency', '_unit_id', 'title', 'test_', 'Score',
                    'Question', 'Difficulty', 'register', 'Skill']
        groupby2 = ['Language', 'Fluency', '_unit_id', 'title', 'test_',
                    'Question', 'Difficulty', 'register', 'Skill']
        sort_values, sort_order = ['Fluency', 'Fail_Rate'], [True, False]
    else:
        # for melt_rc_answer_actual_1A_1B
        select2 = list(wide_cols)
        select3 = []
        # for rc_q_s_pass_rate_1A_1B
        groupby1 = ['Language', 'Fluency', 'Tenure', '_unit_id', 'title',
                    'a', 'q', 'd', 'register', 'skill']
        groupby2 = ['Language', 'Fluency', 'Tenure', '_unit_id', 'title',
                    'q', 'd', 'register', 'skill']
        sort_values, sort_order = ['Fluency', 'Tenure', 'Fail_Rate'], [True, True, False]

    selector_3 = {
        # for rc_fail_rate
        "select1": wide_cols,
        "drop_cols_1": question_cols + answer_cols + ['Score'],
        "drop_cols_2": question_meta + short_answer_cols,
        "explode": id_cols + passage_meta,
        "select2": select2,
        "select3": select3,
        "drop_cols_3": question_meta + question_cols + answer_cols + short_answer_cols,
        "explode2": id_cols + passage_meta,
        "groupby1": groupby1,
        "groupby2": groupby2,
        "sort_values": sort_values,
        "sort_order": sort_order,
    }

    return selector_3
        
        

In [None]:
def report_4_selector(pilot_var_selected):
    """Return the grouping/sorting configuration used to build report 4.

    Parameters
    ----------
    pilot_var_selected : str
        Name of the selected pilot, e.g. 'Pilot 1A-1B' or 'Pilot 3A'.

    Returns
    -------
    dict
        Keys ``groupby1``, ``groupby2``, ``sort_values`` and ``sort_order``.

    Raises
    ------
    ValueError
        If the pilot name is not one of the pilots handled below. Previously
        this case fell through to the ``return`` and failed with an
        UnboundLocalError that did not name the offending pilot.
    """
    # Pilots that share the long column names (Score, Question, Difficulty, ...).
    long_name_pilots = ('Pilot 1C', 'Pilot 1D', 'Pilot 1E', 'Pilot 1E(ES)',
                        'Pilot 2A', 'Pilot 2B-A', 'Pilot 2D', 'Pilot 3A')

    if pilot_var_selected == 'Pilot 1A-1B':

        # Pilot 1A-1B uses the short column names (a, q, d, aa, ra, skill).
        groupby1 = ['Language', 'Fluency', 'Tenure', '_unit_id', 'title', 'aa', 'ra', 'a', 'q', 'd', 'register', 'skill']

        groupby2 = ['Language', 'Fluency', 'Tenure', '_unit_id', 'title', 'q', 'd', 'register', 'skill']

        sort_values, sort_order = ['Fluency', 'Tenure', '_unit_id', 'q', 'Fail_Rate'], [True, True, True, True, False]

    elif pilot_var_selected in long_name_pilots:

        groupby1 = ['Language', 'Fluency', '_unit_id', 'title', 'test_', 'Actual_Answer', 'Rater_Answer',
                    'Score', 'Question', 'Difficulty', 'register', 'Skill']

        groupby2 = ['Language', 'Fluency', '_unit_id', 'title', 'test_', 'Question', 'Difficulty', 'register', 'Skill']

        sort_values, sort_order = ['Fluency', '_unit_id', 'Question', 'Fail_Rate'], [True, True, True, False]

    else:
        raise ValueError(f"Unsupported pilot for report 4: {pilot_var_selected!r}")

    selector_4 = {"groupby1" : groupby1, "groupby2" : groupby2, "sort_values" : sort_values, "sort_order" : sort_order}

    return selector_4
        

#### Functions for calculating Fail Rates

### PILOT 3A, 1C

In [None]:
####  ------------------------REPORT 1 : "Near Exact Match" - v1_actual_correct_by_question  ---------------------------------------------------------------

def v1_fail_rate(v1R, selector_1):  #Valid for Pilot 3A, 1C
    """Build the per-group fail-rate table for report 1.

    Rows of ``v1R`` are counted per ``selector_1['groupby1']`` group, each
    count is divided by the total of its ``selector_1['groupby2']`` group,
    and only the ``Score == 0`` rows are returned, sorted as configured by
    ``selector_1['sort_values']`` / ``selector_1['sort_order']``.
    """
    subset = v1R[selector_1['select1']]

    # Non-null worker ids per detailed group (NaN keys form their own group).
    counts = (
        subset.groupby(selector_1['groupby1'], dropna=False)['_worker_id']
        .count()
        .reset_index()
        .rename(columns={'_worker_id': 'Count_of_Test_Takers'})
    )

    # Denominator: sum of the counts inside each coarser group.
    coarse = counts.groupby(selector_1['groupby2'], dropna=False)['Count_of_Test_Takers']
    counts['Total_Test_Takers'] = coarse.transform('sum')
    counts['Fail_Rate'] = (counts['Count_of_Test_Takers'] / counts['Total_Test_Takers']).round(2)

    # Only the failing rows (Score 0) are reported.
    failures = counts.loc[counts['Score'] == 0]
    failures = failures.sort_values(by=selector_1['sort_values'], ascending=selector_1['sort_order'])

    return failures.reset_index(drop=True)

def generate_report_1(v1R, selector_1):
    """Produce report 1 by delegating to ``v1_fail_rate``."""
    return v1_fail_rate(v1R, selector_1)

####  ---------------------------------------------------------------------------------------------------------------------------------------------------------

####  ------------------------REPORT 2 : "Close Match" - v2_fail_rates  ---------------------------------------------------------------------------------------

def v2_fail_rate(v2R, selector_2):
    """Build the overall fail-rate table for report 2.

    Counts rows per ``selector_2['groupby1']`` group, divides by the total
    of the matching ``selector_2['groupby2']`` group, keeps the
    ``Score == 0`` rows, sorts them as configured and drops ``Score``.
    """
    subset = v2R[selector_2['select1']]

    # Non-null worker ids per detailed group.
    tally = (
        subset.groupby(selector_2['groupby1'], dropna=False)['_worker_id']
        .count()
        .reset_index()
        .rename(columns={'_worker_id': 'Count_of_Test_Takers'})
    )

    # Group totals broadcast back onto every row of the group.
    per_group = tally.groupby(selector_2['groupby2'], dropna=False)['Count_of_Test_Takers']
    tally['Total_Test_Takers'] = per_group.transform('sum')
    tally['Overall_Fail_Rate'] = (tally['Count_of_Test_Takers'] / tally['Total_Test_Takers']).round(2)

    # Keep failures only, order them, then Score carries no information.
    failures = tally.loc[tally['Score'] == 0]
    failures = failures.sort_values(by=selector_2['sort_values'], ascending=selector_2['sort_order'])
    failures = failures.drop('Score', axis=1)

    return failures.reset_index(drop=True)

def v2_fail_rate_2(v2R, selector_2, pilot_var_selected):
    """Build the per-rater-answer rate table (wide format) for report 2.

    Rates are computed per ``selector_2['groupby3']`` group relative to the
    ``selector_2['groupby4']`` total, then pivoted so that every distinct
    ``rater_answer`` value becomes its own column of ``Rate`` values.
    """
    # Only these pilots restrict the table to failing rows. For Pilot 1C/1D/1E/1E(ES)
    # the Score filter is disabled (original note: this seems odd for a fail rate,
    # which should have Score = 0). Any other pilot name is left unfiltered as well.
    score_filtered_pilots = ('Pilot 3A', 'Pilot 2A', 'Pilot 2B-A', 'Pilot 2D')

    answers = v2R[selector_2['select2']]

    # Non-null worker ids per detailed group.
    tally = (
        answers.groupby(selector_2['groupby3'], dropna=False)['_worker_id']
        .count()
        .reset_index()
        .rename(columns={'_worker_id': 'Count_of_Test_Takers'})
    )

    # Share of each detailed group within its coarser group.
    per_group = tally.groupby(selector_2['groupby4'], dropna=False)['Count_of_Test_Takers']
    tally['Total_Test_Takers'] = per_group.transform('sum')
    tally['Rate'] = (tally['Count_of_Test_Takers'] / tally['Total_Test_Takers']).round(2)

    if pilot_var_selected in score_filtered_pilots:
        tally = tally.loc[tally['Score'] == 0]

    tally = tally.sort_values(by=selector_2['sort_values_2'], ascending=selector_2['sort_order_2'])
    tally = tally.drop(selector_2['drop_cols'], axis=1).reset_index(drop=True)

    # Missing index values are replaced by the placeholder 'Null' for the pivot
    # and converted back to NaN afterwards.
    index_cols = selector_2['explode']
    tally[index_cols] = tally[index_cols].fillna('Null')

    wide = pd.pivot_table(tally,
                          index=index_cols,
                          values='Rate',
                          columns=['rater_answer']).reset_index()
    wide.columns.name = None  # remove name for columns

    wide[index_cols] = wide[index_cols].replace('Null', np.nan)

    # remove duplicate rows in the dataframe
    return wide.drop_duplicates()

def merge_v2_fail_rates(v2_actual_correct_by_question, v2_actual_correct_by_question_with_answer, selector_2, pilot_var_selected):
    """Left-join the overall fail rates onto the per-answer rate table.

    The join keys come from ``selector_2['join_on']``. For the pilots listed
    below the result is narrowed to ``selector_2['select3']``; for every
    other pilot (including 'Pilot 2B-A') all merged columns are returned.
    """
    # Original reviewer note on the 'Pilot 2B-A' case: "maybe add 2D here!"
    pilots_with_column_selection = ('Pilot 1C', 'Pilot 1D', 'Pilot 1E', 'Pilot 1E(ES)',
                                    'Pilot 2A', 'Pilot 3A')

    merged = pd.merge(left=v2_actual_correct_by_question_with_answer,
                      right=v2_actual_correct_by_question,
                      how='left',
                      on=selector_2['join_on'])

    if pilot_var_selected in pilots_with_column_selection:
        return merged[selector_2['select3']]

    return merged

def generate_report_2(v2R, selector_2, pilot_var_selected):
    """Produce report 2: per-answer rates joined with the overall fail rates."""
    overall = v2_fail_rate(v2R, selector_2)
    by_answer = v2_fail_rate_2(v2R, selector_2, pilot_var_selected)

    return merge_v2_fail_rates(overall, by_answer, selector_2, pilot_var_selected)
####  ---------------------------------------------------------------------------------------------------------------------------------------------------------

####  ------------------------REPORT 3 : "Reading Comprehension" : rc_question_skill_pass_rate  ---------------------------------------------------------------

def rc_fail_rate(rcR, selector_3):
    """Score each of the four questions and reshape to one row per question.

    For question n the rater's answer (``question_no_n``) is compared with
    ``Answer_no_n``: 1 when equal, 0 when different, NaN when either side is
    missing. The four per-question values (score, difficulty, Google
    Translate error, skill) are joined with ';' and then exploded so that
    each answered question becomes its own row.

    Parameters
    ----------
    rcR : pandas.DataFrame
        Input frame; it is not modified.
    selector_3 : dict
        Uses the keys ``select1``, ``drop_cols_1``, ``drop_cols_2`` and
        ``explode``.

    Returns
    -------
    pandas.DataFrame
        Long-format frame with integer ``Score`` and the columns
        ``Question``, ``Difficulty``, ``Google_Translate_Error``, ``Skill``;
        rows whose score is NaN are removed.
    """
    def _join(frame, columns):
        # Stringify the columns and join them row-wise with ';'.
        return frame[columns].astype('str').agg(';'.join, axis=1)

    questions = range(1, 5)

    # Work on an explicit copy so the column assignments below never target a
    # selection of the caller's frame (chained-assignment hazard).
    vR_temp = rcR[selector_3['select1']].copy()

    # Evaluate if answers match. If either side is empty, the score is NaN.
    for n in questions:
        rater = vR_temp[f'question_no_{n}']
        actual = vR_temp[f'Answer_no_{n}']
        unanswered = rater.isnull() | actual.isnull()
        vR_temp[f'a{n}'] = np.select([unanswered, rater == actual], [np.nan, 1], 0)

    # Dropping columns
    vR_temp = vR_temp.drop(selector_3['drop_cols_1'], axis=1)

    # Concatenate values from different columns with delimiter ;
    vR_temp['Score'] = _join(vR_temp, [f'a{n}' for n in questions])
    vR_temp['Question'] = ';'.join(f'Question {n}' for n in questions)
    vR_temp['Difficulty'] = _join(vR_temp, [f'question_{n}_difficulty' for n in questions])
    vR_temp['Google_Translate_Error'] = _join(vR_temp, [f'question_{n}_google_translate_error' for n in questions])
    vR_temp['Skill'] = _join(vR_temp, [f'Question {n} Skill tested' for n in questions])

    # Dropping more columns
    vR_temp = vR_temp.drop(selector_3['drop_cols_2'], axis=1)

    # Split the ';'-delimited columns and expand to rows (separate_rows in R).
    vR_temp = vR_temp.set_index(selector_3['explode']).apply(lambda x: x.str.split(';').explode()).reset_index()

    # astype('str') turned missing values into the text 'nan'; restore them.
    long_cols = ['Score', 'Question', 'Difficulty', 'Google_Translate_Error', 'Skill']
    vR_temp[long_cols] = vR_temp[long_cols].replace('nan', np.nan)
    vR_temp = vR_temp.dropna(subset=['Score'])  # remove rows with NaN values in Score
    vR_temp['Score'] = vR_temp['Score'].astype(float).astype('int')  # set Score as integer

    return vR_temp

## Melt RC and categorize question choice with letter and question number
def melt_rc_assign(rc_choices, q_list, choice_list):
    """Label melted answer-choice rows with their question and answer letter.

    For every question number in ``q_list`` and choice number in
    ``choice_list`` the rows whose ``variable`` contains both
    ``'question_<q>'`` and ``'choice_<c>'`` are collected, tagged with
    ``Question = 'Question <q>'`` and, for choices 1-3, ``Answer`` = 'a',
    'b' or 'c'. Choice numbers outside 1-3 get no ``Answer`` value.

    Parameters
    ----------
    rc_choices : pandas.DataFrame
        Long-format frame with a string ``variable`` column; not modified.
    q_list, choice_list : iterable of int
        Question and choice numbers to look for.

    Returns
    -------
    pandas.DataFrame
        Concatenation of all labelled subsets, in question-then-choice order.
    """
    answer_letters = {1: 'a', 2: 'b', 3: 'c'}

    labelled = []
    for ql in q_list:
        # The question filter does not depend on the choice, so compute it once.
        question_rows = rc_choices[rc_choices['variable'].str.contains('question_' + str(ql))]
        for cl in choice_list:
            # .copy() so the label assignments below never write to a slice
            # of the input frame.
            subset = question_rows[question_rows['variable'].str.contains('choice_' + str(cl))].copy()
            subset['Question'] = 'Question ' + str(ql)
            if cl in answer_letters:
                subset['Answer'] = answer_letters[cl]
            labelled.append(subset)

    rc_choices = pd.concat(labelled)
    return rc_choices

## Melt RC and categorize question choice with letter and question number
def melt_rc(rcR, selector_3):
    """Melt the answer-choice columns into long format and label them.

    Selects ``selector_3['select2']``, removes duplicate rows, melts every
    column except Language/_unit_id/title/test_ into variable/value pairs
    and labels the rows via ``melt_rc_assign`` for questions 1-4 and
    choices 1-3.

    Returns the same sorted frame three times: as ``rc_choices``,
    ``actual_answer`` and ``rater_answer``.
    """
    id_columns = ['Language', '_unit_id', 'title', 'test_']

    # remove duplicate rows in the dataframe
    unique_rows = rcR[selector_3['select2']].drop_duplicates().reset_index(drop=True)

    long_form = pd.melt(unique_rows, id_vars=id_columns)

    labelled = melt_rc_assign(long_form, [1, 2, 3, 4], [1, 2, 3])
    labelled = labelled[id_columns + ['Question', 'Answer', 'variable', 'value']]
    labelled = labelled.sort_values(['Language', 'title', 'test_', 'Question', 'Answer'])

    # One lookup table serves as choices, actual answers and rater answers.
    return labelled, labelled, labelled

# ## Melt RC into long format with actual answers
def melt_rc_answer_actual(rcR, selector_3):
    """Score the four questions and reshape to long format, keeping answers.

    Same scoring as ``rc_fail_rate`` (1 = rater answer equals the actual
    answer, 0 = differs, NaN = either is missing), but the rater's answer
    (``question_no_n``) and the actual answer (``Answer_no_n``) are carried
    along as ``Rater_Answer`` and ``Actual_Answer``.

    Parameters
    ----------
    rcR : pandas.DataFrame
        Input frame; it is not modified.
    selector_3 : dict
        Uses the keys ``select3``, ``drop_cols_3`` and ``explode2``.

    Returns
    -------
    pandas.DataFrame
        One row per answered question with integer ``Score``.
    """
    def _join(frame, columns):
        # Stringify the columns and join them row-wise with ';'.
        return frame[columns].astype('str').agg(';'.join, axis=1)

    questions = range(1, 5)

    # Work on an explicit copy so the column assignments below never target a
    # selection of the caller's frame (chained-assignment hazard).
    vR_temp = rcR[selector_3['select3']].copy()

    # Evaluate if answers match. If either side is empty, the score is NaN.
    for n in questions:
        rater = vR_temp[f'question_no_{n}']
        actual = vR_temp[f'Answer_no_{n}']
        unanswered = rater.isnull() | actual.isnull()
        vR_temp[f'a{n}'] = np.select([unanswered, rater == actual], [np.nan, 1], 0)

    # The incoming Score column is replaced by the per-question scores below.
    vR_temp = vR_temp.drop('Score', axis=1)

    # Concatenate values from different columns with delimiter ;
    vR_temp['Score'] = _join(vR_temp, [f'a{n}' for n in questions])
    vR_temp['Rater_Answer'] = _join(vR_temp, [f'question_no_{n}' for n in questions])
    vR_temp['Actual_Answer'] = _join(vR_temp, [f'Answer_no_{n}' for n in questions])
    vR_temp['Question'] = ';'.join(f'Question {n}' for n in questions)
    vR_temp['Difficulty'] = _join(vR_temp, [f'question_{n}_difficulty' for n in questions])
    vR_temp['Google_Translate_Error'] = _join(vR_temp, [f'question_{n}_google_translate_error' for n in questions])
    vR_temp['Skill'] = _join(vR_temp, [f'Question {n} Skill tested' for n in questions])

    vR_temp = vR_temp.drop(selector_3['drop_cols_3'], axis=1)

    # Split the ';'-delimited columns and expand to rows (separate_rows in R).
    vR_temp = vR_temp.set_index(selector_3['explode2']).apply(lambda x: x.str.split(';').explode()).reset_index()

    # astype('str') turned missing values into the text 'nan'; restore them.
    long_cols = ['Score', 'Rater_Answer', 'Actual_Answer', 'Question',
                 'Difficulty', 'Google_Translate_Error', 'Skill']
    vR_temp[long_cols] = vR_temp[long_cols].replace('nan', np.nan)
    vR_temp = vR_temp.dropna(subset=['Score'])  # remove rows with NaN values in Score
    vR_temp['Score'] = vR_temp['Score'].astype(float).astype('int')  # set Score as integer

    return vR_temp

def rc_q_s_pass_rate(rc_answer, selector_3):
    """Compute the fail rate per question/skill group for report 3.

    Counts rows per ``selector_3['groupby1']`` group, divides by the total
    of the matching ``selector_3['groupby2']`` group, and returns the
    ``Score == 0`` rows sorted as configured in ``selector_3``.
    """
    # Non-null worker ids per detailed group.
    tally = (
        rc_answer.groupby(selector_3['groupby1'], dropna=False)['_worker_id']
        .count()
        .reset_index()
        .rename(columns={'_worker_id': 'Count'})
    )

    # Group totals broadcast back onto every row of the group.
    tally['Total'] = tally.groupby(selector_3['groupby2'], dropna=False)['Count'].transform('sum')
    tally['Fail_Rate'] = (tally['Count'] / tally['Total']).round(2)

    # Keep only the failing rows, then order and renumber them.
    failures = tally.loc[tally['Score'] == 0]
    failures = failures.sort_values(by=selector_3['sort_values'], ascending=selector_3['sort_order'])

    return failures.reset_index(drop=True)

def generate_report_3(rcR, selector_3):
    """Produce report 3: fail rate per question and skill.

    The per-question scores come from ``rc_fail_rate`` and are aggregated by
    ``rc_q_s_pass_rate``.
    """
    # The earlier version also ran melt_rc() and melt_rc_answer_actual() here
    # and discarded their results; those calls only added run time and were
    # removed. Report 4 builds them itself.
    rc_answer = rc_fail_rate(rcR, selector_3)

    return rc_q_s_pass_rate(rc_answer, selector_3)

####  ---------------------------------------------------------------------------------------------------------------------------------------------------------

####  ------------------------REPORT 4 : "RC with Answers" : rc_question_skill_pass_rate_answer_final  --------------------------------------------------------

def rc_q_s_pass_rate_answer(rc_answer_actual, selector_4):
    """Compute the fail rate per answer combination for report 4.

    Counts rows per ``selector_4['groupby1']`` group, divides by the total
    of the matching ``selector_4['groupby2']`` group, and returns the
    ``Score == 0`` rows sorted as configured in ``selector_4``.
    """
    # Non-null worker ids per detailed group.
    by_answer = (
        rc_answer_actual.groupby(selector_4['groupby1'], dropna=False)['_worker_id']
        .count()
        .reset_index()
        .rename(columns={'_worker_id': 'Count'})
    )

    # Denominator: all counts that share the coarser grouping.
    group_totals = by_answer.groupby(selector_4['groupby2'], dropna=False)['Count'].transform('sum')
    by_answer['Total'] = group_totals
    by_answer['Fail_Rate'] = (by_answer['Count'] / by_answer['Total']).round(2)

    # Only wrong answers (Score 0) are reported.
    wrong = by_answer.loc[by_answer['Score'] == 0]
    wrong = wrong.sort_values(by=selector_4['sort_values'], ascending=selector_4['sort_order'])

    return wrong.reset_index(drop=True)

def join_rc_q_s_pass_rate_answer(rc_question_skill_pass_rate_answer, actual_answer, rater_answer):
    """Attach the answer texts to the per-answer fail-rate table.

    Two left joins look up the ``value`` for the actual answer letter and
    for the rater's answer letter; the result is reduced to the report
    columns and the letter/text columns are given descriptive names.
    """
    shared_keys = ["Language", "_unit_id", "title", "test_", "Question"]
    lookup_keys = shared_keys + ["Answer"]

    # First lookup: match on the actual answer letter.
    with_actual = pd.merge(rc_question_skill_pass_rate_answer, actual_answer, how='left',
                           left_on=shared_keys + ["Actual_Answer"],
                           right_on=lookup_keys)
    with_actual = with_actual.drop(columns='Answer')

    # Second lookup: match on the rater's answer letter. The clashing
    # 'value' columns receive the default _x (actual) / _y (rater) suffixes.
    with_both = pd.merge(with_actual, rater_answer, how='left',
                         left_on=shared_keys + ["Rater_Answer"],
                         right_on=lookup_keys)
    with_both = with_both.drop(columns='Answer')

    report_columns = ['Language', 'Fluency', '_unit_id', 'title', 'test_', 'Difficulty', 'register', 'Skill', 'Question',
                      'Actual_Answer', 'value_x', 'Rater_Answer', 'value_y', 'Count', 'Total', 'Fail_Rate']
    new_names = {"Actual_Answer": "Actual_Answer_Letter",
                 "value_x": "Actual_Answer_Text",
                 "Rater_Answer": "Rater_Answer_Letter",
                 "value_y": "Rater_Answer_Text"}

    return with_both[report_columns].rename(columns=new_names)


def generate_report_4(rcR, selector_3, selector_4):
    """Produce report 4: per-answer fail rates with the answer texts attached."""
    # The first element (rc_choices) is not needed for this report.
    _, actual_answer, rater_answer = melt_rc(rcR, selector_3)

    scored_answers = melt_rc_answer_actual(rcR, selector_3)
    fail_rates = rc_q_s_pass_rate_answer(scored_answers, selector_4)

    return join_rc_q_s_pass_rate_answer(fail_rates, actual_answer, rater_answer)

####  ---------------------------------------------------------------------------------------------------------------------------------------------------------

### PILOT 1A-1B

In [None]:
####  ------------------------REPORT 1 : "Near Exact Match" - v1_actual_correct_by_question  ------------------------------------------------------------------

def v1_fail_rate_1A_1B(v1R, selector_1):  #Valid for Pilot 1A-1B
    """Build the per-group fail-rate table for report 1 (Pilot 1A-1B).

    Counts rows per ``selector_1['groupby1']`` group, divides by the total
    of the matching ``selector_1['groupby2']`` group, and returns the
    ``Score == 0`` rows sorted as configured in ``selector_1``.
    """
    chosen = v1R[selector_1['select1']]

    # Non-null worker ids per detailed group.
    per_group = chosen.groupby(selector_1['groupby1'], dropna=False)['_worker_id'].count()
    table = per_group.reset_index().rename(columns={'_worker_id': 'Count_of_Test_Takers'})

    # Totals of the coarser grouping, aligned row by row.
    table['Total_Test_Takers'] = (
        table.groupby(selector_1['groupby2'], dropna=False)['Count_of_Test_Takers'].transform('sum')
    )
    table['Fail_Rate'] = (table['Count_of_Test_Takers'] / table['Total_Test_Takers']).round(2)

    # Failing rows only, in the configured order, with a fresh index.
    is_fail = table['Score'] == 0
    ordered = table.loc[is_fail].sort_values(by=selector_1['sort_values'],
                                             ascending=selector_1['sort_order'])

    return ordered.reset_index(drop=True)

def generate_report_1_1A_1B(v1R, selector_1):
    """Produce report 1 for Pilot 1A-1B by delegating to ``v1_fail_rate_1A_1B``."""
    return v1_fail_rate_1A_1B(v1R, selector_1)

####  ---------------------------------------------------------------------------------------------------------------------------------------------------------

####  ------------------------REPORT 2 : "Close Match" - v2_fail_rates  ---------------------------------------------------------------------------------------

def v2_fail_rate_1A_1B(v2R, selector_2):
    """Build the fail-rate table for report 2 (Pilot 1A-1B), without Score.

    Counts rows per ``selector_2['groupby1']`` group, divides by the total
    of the matching ``selector_2['groupby2']`` group, keeps the
    ``Score == 0`` rows, sorts them as configured and drops ``Score``.
    """
    chosen = v2R[selector_2['select1']]

    # Non-null worker ids per detailed group.
    per_group = chosen.groupby(selector_2['groupby1'], dropna=False)['_worker_id'].count()
    table = per_group.reset_index().rename(columns={'_worker_id': 'Count_of_Test_Takers'})

    # Totals of the coarser grouping, aligned row by row.
    table['Total_Test_Takers'] = (
        table.groupby(selector_2['groupby2'], dropna=False)['Count_of_Test_Takers'].transform('sum')
    )
    table['Fail_Rate'] = (table['Count_of_Test_Takers'] / table['Total_Test_Takers']).round(2)

    # Failing rows only, ordered; Score is constant afterwards and is dropped.
    is_fail = table['Score'] == 0
    ordered = table.loc[is_fail].sort_values(by=selector_2['sort_values'],
                                             ascending=selector_2['sort_order'])
    ordered = ordered.drop('Score', axis=1)

    return ordered.reset_index(drop=True)

def v2_fail_rate_2_1A_1B(v2R, selector_2):
    """Build the per-answer fail-rate table for report 2 (Pilot 1A-1B).

    Counts rows per ``selector_2['groupby3']`` group, divides by the total
    of the matching ``selector_2['groupby4']`` group, and returns the
    ``Score == 0`` rows sorted by ``selector_2['sort_values_2']``. The index
    is not reset, so it keeps the labels of the grouped table.
    """
    chosen = v2R[selector_2['select2']]

    # Non-null worker ids per detailed group.
    per_group = chosen.groupby(selector_2['groupby3'], dropna=False)['_worker_id'].count()
    table = per_group.reset_index().rename(columns={'_worker_id': 'Count_of_Test_Takers'})

    # Totals of the coarser grouping, aligned row by row.
    table['Total_Test_Takers'] = (
        table.groupby(selector_2['groupby4'], dropna=False)['Count_of_Test_Takers'].transform('sum')
    )
    table['Fail_Rate'] = (table['Count_of_Test_Takers'] / table['Total_Test_Takers']).round(2)

    # Failing rows only, in the configured order.
    is_fail = table['Score'] == 0
    return table.loc[is_fail].sort_values(by=selector_2['sort_values_2'],
                                          ascending=selector_2['sort_order_2'])

def generate_report_2_1A_1B(v2R, selector_2):
    """Produce report 2 for Pilot 1A-1B: the per-answer fail-rate table."""
    # The earlier version also computed v2_fail_rate_1A_1B(v2R, selector_2)
    # and never used the result; that dead call was removed.
    # NOTE(review): generate_report_2 merges both tables for the other
    # pilots - confirm that returning only the per-answer table is intended.
    return v2_fail_rate_2_1A_1B(v2R, selector_2)

####  ---------------------------------------------------------------------------------------------------------------------------------------------------------

####  ------------------------REPORT 3 : "Reading Comprehension" : rc_question_skill_pass_rate  ---------------------------------------------------------------

def rc_fail_rate_1A_1B(rcR, selector_3):
    """Score the four questions and reshape to one row per question (Pilot 1A-1B).

    For question n the rater's answer (``question_no_n``) is compared with
    ``Answer_no_n``: 1 when equal, 0 when different, NaN when either side is
    missing. The per-question values are joined with ';' into the short
    columns ``a`` (score), ``q`` (question), ``d`` (difficulty), ``ge``
    (Google Translate error) and ``skill``, then exploded to rows.

    Parameters
    ----------
    rcR : pandas.DataFrame
        Input frame; it is not modified.
    selector_3 : dict
        Uses the keys ``select1``, ``drop_cols_1``, ``drop_cols_2`` and
        ``explode``.

    Returns
    -------
    pandas.DataFrame
        Long-format frame with integer ``a``; rows whose score is NaN are
        removed.
    """
    def _join(frame, columns):
        # Stringify the columns and join them row-wise with ';'.
        return frame[columns].astype('str').agg(';'.join, axis=1)

    questions = range(1, 5)

    # Work on an explicit copy so the column assignments below never target a
    # selection of the caller's frame (chained-assignment hazard).
    vR_temp = rcR[selector_3['select1']].copy()

    # Evaluate if answers match. If either side is empty, the score is NaN.
    for n in questions:
        rater = vR_temp[f'question_no_{n}']
        actual = vR_temp[f'Answer_no_{n}']
        unanswered = rater.isnull() | actual.isnull()
        vR_temp[f'a{n}'] = np.select([unanswered, rater == actual], [np.nan, 1], 0)

    # Dropping columns  drop_cols_1
    vR_temp = vR_temp.drop(selector_3['drop_cols_1'], axis=1)

    # Concatenate values from different columns with delimiter ;
    vR_temp['a'] = _join(vR_temp, [f'a{n}' for n in questions])
    vR_temp['q'] = ';'.join(f'Question {n}' for n in questions)
    vR_temp['d'] = _join(vR_temp, [f'question_{n}_difficulty' for n in questions])
    vR_temp['ge'] = _join(vR_temp, [f'question_{n}_google_translate_error' for n in questions])
    vR_temp['skill'] = _join(vR_temp, [f'Question {n} Skill tested' for n in questions])

    # Dropping more columns
    vR_temp = vR_temp.drop(selector_3['drop_cols_2'], axis=1)

    # Split the ';'-delimited columns and expand to rows (separate_rows in R).
    vR_temp = vR_temp.set_index(selector_3['explode']).apply(lambda x: x.str.split(';').explode()).reset_index()

    # astype('str') turned missing values into the text 'nan'; restore them.
    long_cols = ['a', 'q', 'd', 'ge', 'skill']
    vR_temp[long_cols] = vR_temp[long_cols].replace('nan', np.nan)
    vR_temp = vR_temp.dropna(subset=['a'])  # remove rows with NaN values in the score
    vR_temp['a'] = vR_temp['a'].astype(float).astype('int')  # set the score as integer

    return vR_temp

# ## Melt RC into long format with actual answers
def melt_rc_answer_actual_1A_1B(rcR, selector_3):
    """Score the four questions and reshape to long format, keeping answers (Pilot 1A-1B).

    Scoring is 1 when the rater's answer (``question_no_n``) equals
    ``Answer_no_n``, 0 when it differs and NaN when either is missing. The
    per-question values are joined with ';' into ``a`` (score), ``ra``
    (rater answer), ``aa`` (actual answer), ``q``, ``d``, ``ge`` and
    ``skill``, then exploded to one row per question.

    Parameters
    ----------
    rcR : pandas.DataFrame
        Input frame; it is not modified.
    selector_3 : dict
        Uses the keys ``select2``, ``drop_cols_3`` and ``explode2``.

    Returns
    -------
    pandas.DataFrame
        One row per answered question with integer ``a``.
    """
    def _join(frame, columns):
        # Stringify the columns and join them row-wise with ';'.
        return frame[columns].astype('str').agg(';'.join, axis=1)

    questions = range(1, 5)

    # Work on an explicit copy so the column assignments below never target a
    # selection of the caller's frame (chained-assignment hazard).
    vR_temp = rcR[selector_3['select2']].copy()

    # Evaluate if answers match. If either side is empty, the score is NaN.
    for n in questions:
        rater = vR_temp[f'question_no_{n}']
        actual = vR_temp[f'Answer_no_{n}']
        unanswered = rater.isnull() | actual.isnull()
        vR_temp[f'a{n}'] = np.select([unanswered, rater == actual], [np.nan, 1], 0)

    # The incoming Score column is not used by this report.
    vR_temp = vR_temp.drop('Score', axis=1)

    # Concatenate values from different columns with delimiter ;
    vR_temp['a'] = _join(vR_temp, [f'a{n}' for n in questions])
    vR_temp['ra'] = _join(vR_temp, [f'question_no_{n}' for n in questions])
    vR_temp['aa'] = _join(vR_temp, [f'Answer_no_{n}' for n in questions])
    vR_temp['q'] = ';'.join(f'Question {n}' for n in questions)
    vR_temp['d'] = _join(vR_temp, [f'question_{n}_difficulty' for n in questions])
    vR_temp['ge'] = _join(vR_temp, [f'question_{n}_google_translate_error' for n in questions])
    vR_temp['skill'] = _join(vR_temp, [f'Question {n} Skill tested' for n in questions])

    vR_temp = vR_temp.drop(selector_3['drop_cols_3'], axis=1)

    # Split the ';'-delimited columns and expand to rows (separate_rows in R).
    vR_temp = vR_temp.set_index(selector_3['explode2']).apply(lambda x: x.str.split(';').explode()).reset_index()

    # astype('str') turned missing values into the text 'nan'; restore them.
    long_cols = ['a', 'ra', 'aa', 'q', 'd', 'ge', 'skill']
    vR_temp[long_cols] = vR_temp[long_cols].replace('nan', np.nan)
    vR_temp = vR_temp.dropna(subset=['a'])  # remove rows with NaN values in the score
    vR_temp['a'] = vR_temp['a'].astype(float).astype('int')  # set the score as integer

    return vR_temp

def rc_q_s_pass_rate_1A_1B(rc_answer, selector_3):
    """Compute the fail rate per question/skill group for report 3 (Pilot 1A-1B).

    Counts rows per ``selector_3['groupby1']`` group, divides by the total
    of the matching ``selector_3['groupby2']`` group, and returns the rows
    whose score column ``a`` is 0, sorted as configured in ``selector_3``.
    """
    # Non-null worker ids per detailed group.
    per_group = rc_answer.groupby(selector_3['groupby1'], dropna=False)['_worker_id'].count()
    table = per_group.reset_index().rename(columns={'_worker_id': 'Count'})

    # Totals of the coarser grouping, aligned row by row.
    table['Total'] = table.groupby(selector_3['groupby2'], dropna=False)['Count'].transform('sum')
    table['Fail_Rate'] = (table['Count'] / table['Total']).round(2)

    # Failing rows only (a == 0), ordered and renumbered.
    failed = table.loc[table['a'] == 0]
    failed = failed.sort_values(by=selector_3['sort_values'], ascending=selector_3['sort_order'])

    return failed.reset_index(drop=True)

def generate_report_3_1A_1B(rcR, selector_3):
    """Produce Report 3 ("Reading Comprehension") for Pilot 1A-1B.

    Parameters
    ----------
    rcR : object
        Reading-comprehension data handed to ``rc_fail_rate_1A_1B`` and
        ``melt_rc_answer_actual_1A_1B``.
    selector_3 : dict
        Report 3 configuration forwarded to every helper.

    Returns
    -------
    The table returned by ``rc_q_s_pass_rate_1A_1B``.
    """
    fail_rate_input = rc_fail_rate_1A_1B(rcR, selector_3)

    # NOTE(review): the melted frame is not used by this report. The call is
    # kept so the sequence of operations on rcR stays exactly as before --
    # confirm whether it can be dropped.
    melt_rc_answer_actual_1A_1B(rcR, selector_3)

    return rc_q_s_pass_rate_1A_1B(fail_rate_input, selector_3)

####  ---------------------------------------------------------------------------------------------------------------------------------------------------------

####  ------------------------REPORT 4 : "RC with Answers" : rc_question_skill_pass_rate_answer_final  --------------------------------------------------------

def rc_q_s_pass_rate_answer_1A_1B(rc_answer_actual, selector_4):
    """Build the "RC with Answers" fail-rate table for Pilot 1A-1B.

    Parameters
    ----------
    rc_answer_actual : pandas.DataFrame
        Melted answer-level data. Must contain '_worker_id' and every column
        named in ``selector_4``; the score column 'a' has to be one of the
        'groupby1' columns because it is filtered on after the aggregation.
    selector_4 : dict
        Report configuration with the keys 'groupby1', 'groupby2',
        'sort_values' and 'sort_order'.

    Returns
    -------
    pandas.DataFrame
        Rows whose score 'a' equals 0, with the added columns 'Count',
        'Total' and 'Fail_Rate', sorted as configured and re-indexed from 0.
    """
    fine_keys = selector_4['groupby1']
    coarse_keys = selector_4['groupby2']

    # Count non-null worker ids per fine-grained key (NaN keys form their own group).
    summary = (
        rc_answer_actual
        .groupby(fine_keys, dropna=False)['_worker_id']
        .count()
        .reset_index()
        .rename(columns={"_worker_id": "Count"})
    )

    # Sum the counts over the coarser key to obtain each row's denominator.
    summary['Total'] = summary.groupby(coarse_keys, dropna=False)['Count'].transform('sum')
    summary['Fail_Rate'] = (summary['Count'] / summary['Total']).round(2)

    # Keep Score 0 rows only, then apply the configured ordering.
    zero_score = summary[summary['a'] == 0]
    zero_score = zero_score.sort_values(selector_4['sort_values'], ascending=selector_4['sort_order'])

    return zero_score.reset_index(drop=True)

def generate_report_4_1A_1B(rcR, selector_3, selector_4):
    """Produce Report 4 ("RC with Answers") for Pilot 1A-1B.

    Parameters
    ----------
    rcR : object
        Reading-comprehension data handed to ``melt_rc_answer_actual_1A_1B``.
    selector_3 : dict
        Report 3 configuration; the melt step is driven by it.
    selector_4 : dict
        Report 4 configuration used for the fail-rate aggregation.

    Returns
    -------
    The table returned by ``rc_q_s_pass_rate_answer_1A_1B``.
    """
    # Melt first (Report 3 selector), then aggregate (Report 4 selector).
    melted_answers = melt_rc_answer_actual_1A_1B(rcR, selector_3)

    return rc_q_s_pass_rate_answer_1A_1B(melted_answers, selector_4)

####  ---------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
def generate_all_fail_rate_reports(rcR, v1R, v2R, rc, v1, v2, run_value, pilot_var_selected, rcS, v1S, v2S):
    """Generate the four fail-rate reports and the three summaries for a run.

    Parameters
    ----------
    rcR, v1R, v2R :
        Inputs handed to the Report 3/4, Report 1 and Report 2 generators
        respectively.
    rc, v1, v2 :
        Stored as the summaries when ``run_value`` is 'Deployment'.
    run_value : str
        'Deployment' selects the deployment summaries; any other value
        selects the pilot summaries.
    pilot_var_selected : str
        Pilot variation. It selects the report selectors and decides whether
        the standard or the '_1A_1B' report generators are used.
    rcS, v1S, v2S :
        Stored as the summaries when ``run_value`` is not 'Deployment'.

    Returns
    -------
    tuple of (dict, dict)
        ``list_of_datasets`` maps the four sheet names to their reports;
        ``list_of_summaries`` maps three summary names to their data, in the
        order rc, v1, v2.

    Raises
    ------
    ValueError
        If ``pilot_var_selected`` is not a supported pilot variation.
    """
    standard_pilots = ('Pilot 1C', 'Pilot 1D', 'Pilot 1E', 'Pilot 1E(ES)',
                       'Pilot 2A', 'Pilot 2B-A', 'Pilot 2D', 'Pilot 3A')

    if pilot_var_selected in standard_pilots:
        use_1a_1b = False
    elif pilot_var_selected == 'Pilot 1A-1B':
        use_1a_1b = True
    else:
        # Without this guard an unknown pilot fell through both branches and
        # failed at the return statement with an opaque UnboundLocalError.
        raise ValueError(f"Unsupported pilot variation: {pilot_var_selected!r}")

    # Report 1 - Near Exact Match - v1_actual_correct_by_question
    selector_1 = report_1_selector(pilot_var_selected)
    if use_1a_1b:
        v1_actual_correct_by_question = generate_report_1_1A_1B(v1R, selector_1)
    else:
        v1_actual_correct_by_question = generate_report_1(v1R, selector_1)

    # Report 2 - Close Match - v2_fail_rates
    selector_2 = report_2_selector(pilot_var_selected)
    if use_1a_1b:
        v2_fail_rates = generate_report_2_1A_1B(v2R, selector_2)
    else:
        v2_fail_rates = generate_report_2(v2R, selector_2, pilot_var_selected)

    # Report 3 - Reading Comprehension - rc_question_skill_pass_rate
    selector_3 = report_3_selector(pilot_var_selected)
    if use_1a_1b:
        rc_question_skill_pass_rate = generate_report_3_1A_1B(rcR, selector_3)
    else:
        rc_question_skill_pass_rate = generate_report_3(rcR, selector_3)

    # Report 4 - RC with Answers - rc_question_skill_pass_rate_answer_final
    selector_4 = report_4_selector(pilot_var_selected)
    if use_1a_1b:
        rc_question_skill_pass_rate_answer_final = generate_report_4_1A_1B(rcR, selector_3, selector_4)
    else:
        rc_question_skill_pass_rate_answer_final = generate_report_4(rcR, selector_3, selector_4)

    # Store all 4 reports in a dictionary keyed by sheet name.
    list_of_datasets = {"Near Exact Match": v1_actual_correct_by_question,
                        "Close Match": v2_fail_rates,
                        "Reading Comprehension": rc_question_skill_pass_rate,
                        "RC with Answers": rc_question_skill_pass_rate_answer_final}

    # Store all 3 summaries in a dictionary; insertion order is rc, v1, v2.
    if run_value == 'Deployment':
        list_of_summaries = {"deployment_rc": rc,
                             "deployment_v1": v1,
                             "deployment_v2": v2}
    else:
        list_of_summaries = {pilot_var_selected + "_rc": rcS,
                             pilot_var_selected + "_v1": v1S,
                             pilot_var_selected + "_v2": v2S}

    return list_of_datasets, list_of_summaries

def file_check_create(root_path, config, language_selected, run_value, pilot_var_selected):
    """Create the output folders for a run if needed and return their paths.

    Parameters
    ----------
    root_path : str
        Base directory under which all report folders are created.
    config : dict
        Must provide ``config['report']['deliverable']`` and
        ``config['report']['analysis']`` (folder paths relative to root_path).
    language_selected : str
        Last component of the run folder.
    run_value : str
        'Deployment' gives <deliverable>/<run_value>/<language> and the
        'Deployment Summary' analysis folder. Any other value gives
        <deliverable>/<run_value>/<pilot>/<language> and 'Grand Summary'.
    pilot_var_selected : str
        Pilot folder name; only used when run_value is not 'Deployment'.

    Returns
    -------
    tuple of (str, str, str)
        ``run_folder``, ``analysis_folder`` and ``folder_tag``.
    """
    def _ensure_dir(path):
        # Create the directory (and parents) only when nothing exists at path.
        if not os.path.exists(path):
            os.makedirs(path, exist_ok=True)

    if run_value == 'Deployment':
        run_parts = (run_value, language_selected)
        folder_tag = 'Deployment Summary'
    else:
        run_parts = (run_value, pilot_var_selected, language_selected)
        folder_tag = 'Grand Summary'

    run_folder = os.path.join(root_path, config['report']['deliverable'], *run_parts)
    _ensure_dir(run_folder)

    # The analysis folder holds one sub-folder per test type.
    analysis_folder = os.path.join(root_path, config['report']['analysis'], folder_tag)
    _ensure_dir(analysis_folder)
    for test_type in ('RC', 'V1', 'V2'):
        _ensure_dir(os.path.join(analysis_folder, test_type))

    return run_folder, analysis_folder, folder_tag

def write_fail_report_to_excel(run_folder, list_of_datasets, encoding=None):
    """Write every fail-rate report to one workbook, one sheet per report.

    Parameters
    ----------
    run_folder : str
        Existing directory in which 'language_fail_rates.xlsx' is written.
    list_of_datasets : dict
        Maps sheet name -> DataFrame.
    encoding : optional
        Accepted for backward compatibility and ignored; it was never
        forwarded to pandas by this function.
    """
    workbook_path = os.path.join(run_folder, 'language_fail_rates.xlsx')

    with pd.ExcelWriter(workbook_path) as writer:
        for sheet_name, report in list_of_datasets.items():
            # No 'encoding' keyword: DataFrame.to_excel deprecated it in
            # pandas 1.5 and removed it in 2.0, where passing it raises TypeError.
            report.to_excel(writer, sheet_name=sheet_name, index=False)
            
def write_summary_to_excel(analysis_folder, list_of_summaries, encoding=None):
    """Write each summary to its own workbook under RC/, V1/ and V2/.

    Parameters
    ----------
    analysis_folder : str
        Directory that already contains the 'RC', 'V1' and 'V2' sub-folders.
    list_of_summaries : dict
        Maps file stem -> DataFrame. Entries are paired positionally with
        'RC', 'V1', 'V2', so the insertion order must be rc, v1, v2.
    encoding : optional
        Accepted for backward compatibility and ignored; it was never
        forwarded to pandas by this function.
    """
    folders = ['RC', 'V1', 'V2']

    for (key, value), folder in zip(list_of_summaries.items(), folders):
        target = os.path.join(analysis_folder, folder, key + '.xlsx')
        # No 'encoding' keyword: DataFrame.to_excel deprecated it in
        # pandas 1.5 and removed it in 2.0, where passing it raises TypeError.
        value.to_excel(target, index=False)

#### Run all 

In [None]:
def main():
    """Run the whole fail-rate pipeline interactively.

    Loads the processed data, asks for the language to assess, generates the
    four fail-rate reports plus the three summaries, creates the output
    folders and writes everything to Excel.

    Returns
    -------
    tuple
        ``(r1, r2, r3, rc, v1, v2, pilot_var_selected, rcR, v1R, v2R,
        list_of_datasets, list_of_summaries)``
    """
    print('\nData processing in progress...')
    # Pull everything from the data_processing module; names with a leading
    # underscore are returned by it but not needed here.
    (_raters, r1, r2, r3, languages, rc, v1, v2, run_value, _run_value_2,
     _survey_selected, _survey_files, _pilot_variation, _pilot_selected,
     pilot_var_selected, rcS, v1S, v2S) = data_processing.main()
    print('Data processing completed.')
    print("\n")
    print(languages)

    # Ask the user which language is being assessed.
    language_selected = language_selection(languages)

    # Language-specific data for the three test types.
    rcR, v1R, v2R = get_time_taken_all(language_selected, rc, v1, v2)

    print('\nGenerating reports ...')

    # Build the fail rate reports and the summaries.
    list_of_datasets, list_of_summaries = generate_all_fail_rate_reports(
        rcR, v1R, v2R, rc, v1, v2, run_value, pilot_var_selected, rcS, v1S, v2S
    )

    # Make sure the deliverable and analysis folders for this run exist.
    run_folder, analysis_folder, folder_tag = file_check_create(
        root_path, config, language_selected, run_value, pilot_var_selected
    )

    # Reports go to a single workbook inside run_folder.
    write_fail_report_to_excel(run_folder, list_of_datasets)

    print(f"\n1. Language fail rates report completed and stored in reports > deliverables > {run_value} > {pilot_var_selected} > {language_selected}")

    # Summaries go to one Excel workbook each inside analysis_folder.
    write_summary_to_excel(analysis_folder, list_of_summaries, encoding='utf-8')

    print(f"\n2. Summary report completed and stored in analysis > {folder_tag} > RC/V1/V2")

    return (r1, r2, r3, rc, v1, v2, pilot_var_selected, rcR, v1R, v2R,
            list_of_datasets, list_of_summaries)
    
# Entry point: run the pipeline and keep its results as module-level names.
if __name__ == "__main__":

    (r1, r2, r3, rc, v1, v2, pilot_var_selected,
     rcR, v1R, v2R, list_of_datasets, list_of_summaries) = main()

In [None]:
#### BLOCK TEMPLATE

####  ------------------------REPORT 1 : "Near Exact Match" - v1_actual_correct_by_question  ------------------------------------------------------------------
####  ---------------------------------------------------------------------------------------------------------------------------------------------------------

####  ------------------------REPORT 2 : "Close Match" - v2_fail_rates  ---------------------------------------------------------------------------------------
####  ---------------------------------------------------------------------------------------------------------------------------------------------------------

####  ------------------------REPORT 3 : "Reading Comprehension" : rc_question_skill_pass_rate  ---------------------------------------------------------------
####  ---------------------------------------------------------------------------------------------------------------------------------------------------------

####  ------------------------REPORT 4 : "RC with Answers" : rc_question_skill_pass_rate_answer_final  --------------------------------------------------------
####  ---------------------------------------------------------------------------------------------------------------------------------------------------------