###################################################################       
#Script Name    :                                                                                              
#Description    :                                                                                 
#Args           :                                                                                           
#Author         : Nikhil Rao in R, converted to Python by Nor Raymond                                              
#Email          : nraymond@appen.com                                          
###################################################################

### Fail Rate Reports for Pilot

# In [16]:
import os
import glob 
import pandas as pd
import numpy as np
import yaml
import warnings
from functools import reduce
warnings.filterwarnings("ignore")

# In [17]:
# Read a YAML configuration file from the configured directory
def load_config(config_name):
    """Parse the YAML file ``config_name`` and return its content.

    The file is looked up inside ``config_path``, a module-level name that
    is resolved when the function is called, and parsed with
    ``yaml.safe_load``.
    """
    yaml_location = os.path.join(config_path, config_name)
    with open(yaml_location, 'r') as handle:
        return yaml.safe_load(handle)

# Directory holding the YAML catalog, relative to the current working directory
config_path = "conf/base"

try:
    
    # load yaml catalog configuration file
    config = load_config("catalog.yml")

    # move into the project directory named by the catalog and remember it
    os.chdir(config["project_path"])
    root_path = os.getcwd()
    
except:
    
    # NOTE(review): this bare except catches every exception (including
    # KeyboardInterrupt/SystemExit), not only a missing catalog file.
    # Fallback: retry the same steps from the parent directory.
    os.chdir('..')
    # load yaml catalog configuration file
    config = load_config("catalog.yml")

    os.chdir(config["project_path"])
    root_path = os.getcwd()
    
# Project-local imports; they come after the chdir above
# (presumably so that ``src`` resolves from the project root - TODO confirm).
# import data_processing module
import src.data.data_processing as data_processing
# import data_cleaning module
import src.data.data_cleaning as data_cleaning

# In [18]:
def language_selection(languages):
    """Ask on stdin for a language number until a valid one is entered.

    Parameters
    ----------
    languages : pandas.DataFrame
        Table of selectable languages. The accepted numbers are bounded by
        the smallest and largest index label; the language name is read from
        the first column.

    Returns
    -------
    The value found in the first column of the selected row.

    Notes
    -----
    The previous implementation compared the already-converted integer with
    ``""`` (never true) and carried a ``try ... else: break`` that could not
    be reached. An empty reply is now detected before the conversion so that
    its dedicated message is actually shown.
    """
    lowest, highest = min(languages.index), max(languages.index)

    while True:
        reply = input("\nPlease select the number of the Language you are assessing: ")

        # int("") raises ValueError, so an empty reply has to be caught first
        if reply.strip() == "":
            print("\nYou must enter any numbers")
            continue

        try:
            language_index = int(reply)
        except ValueError:
            print(f"\nYou must enter numerical values only... Please try again")
            continue

        if language_index < lowest or language_index > highest:
            print(f"\nYou must enter numbers between {lowest} - {highest}... Please try again")
            continue

        # NOTE(review): the bounds above are index *labels* while ``iloc`` is
        # positional; both agree only for a default 0..n-1 index - confirm
        # that callers always pass such a frame.
        language_selected = languages.iloc[language_index, 0]
        print(f"\nYou have selected {language_index} for {language_selected}")
        return language_selected


#### Functions for Language Modification - getting the overall time taken

# In [19]:
# function for Language Modification
def get_time_taken(df, language_selected):
    """Add per-item and per-worker time-taken columns for one language.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``Language``, ``_worker_id``, ``_created_at``
        and ``_started_at`` (the last two are subtracted and read through the
        ``.dt`` accessor, so they have to be datetime-like).
    language_selected
        Value of ``Language`` to keep.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` for the selected language with two extra columns:
        ``Time_Taken_Seconds`` (per row) and ``Time_Taken_Minutes_Overall``
        (sum of the seconds per ``_worker_id`` divided by 60). The input
        frame is left untouched.
    """
    # Explicit copy: the new column must not be written onto a filtered slice
    dfr = df[df['Language'] == language_selected].copy()

    # Time taken by item.
    # NOTE(review): ``.dt.seconds`` is only the seconds component of the
    # timedelta (whole days are dropped); ``.dt.total_seconds()`` may be what
    # is intended if an item can span more than 24 hours - confirm.
    dfr["Time_Taken_Seconds"] = (dfr['_created_at'] - dfr['_started_at']).dt.seconds

    # Time taken overall: sum the seconds column only. The former
    # ``groupby(...).sum('Time_Taken_Seconds')`` handed the column name to the
    # ``numeric_only`` parameter and summed every numeric column instead.
    seconds_per_worker = dfr.groupby('_worker_id')['Time_Taken_Seconds'].sum()
    minutes_per_worker = (seconds_per_worker / 60).rename("Time_Taken_Minutes_Overall").reset_index()

    return pd.merge(dfr, minutes_per_worker, how='left', on='_worker_id')

def get_time_taken_all(language_selected, rc, v1, v2):
    """Apply ``get_time_taken`` to the three test frames.

    Returns the processed ``rc``, ``v1`` and ``v2`` frames, in that order,
    each one filtered on ``language_selected``.
    """
    rcR, v1R, v2R = [get_time_taken(frame, language_selected) for frame in (rc, v1, v2)]

    return rcR, v1R, v2R

#### Functions for calculating Fail Rates

### PILOT 3A

#### REPORT 1 : "Near Exact Match" - v1_actual_correct_by_question

# In [20]:
def v1_fail_rate_3A(v1R):  # Valid for 3A
    """Compute the share of Score-0 responses per question (pilot 3A).

    ``v1R`` must contain the question descriptors listed below plus
    ``Answer``, ``Score`` and ``_worker_id``. The result has one row per
    question/answer combination whose ``Score`` is 0, with the columns
    ``Count_of_Test_Takers``, ``Total_Test_Takers`` and ``Fail_Rate``
    (count / total, rounded to 2 decimals), ordered by ``Fluency`` and then
    by descending ``Fail_Rate``.
    """
    question_keys = ['Language', 'Fluency', '_unit_id', 'question_', 'a_domain', 'a_register',
                     'wordphrase_a', 'b_domain', 'b_register', 'wordphrase_b', 'difficulty']
    outcome_keys = question_keys + ['Answer', 'Score']

    # number of workers behind every question / answer / score combination
    tally = (
        v1R[outcome_keys + ['_worker_id']]
        .groupby(outcome_keys)['_worker_id']
        .count()
        .reset_index()
        .rename(columns={'_worker_id': 'Count_of_Test_Takers'})
    )

    # number of workers per question, whatever their score
    tally['Total_Test_Takers'] = tally.groupby(question_keys)['Count_of_Test_Takers'].transform('sum')
    tally['Fail_Rate'] = (tally['Count_of_Test_Takers'] / tally['Total_Test_Takers']).round(2)

    # keep the Score 0 rows, highest fail rate first within each Fluency
    failed = tally[tally['Score'] == 0]
    failed = failed.sort_values(['Fluency', 'Fail_Rate'], ascending=[True, False])

    return failed.reset_index(drop=True)

def generate_report_1_3A(v1R):
    """Report 1 for pilot 3A: the frame produced by ``v1_fail_rate_3A``."""
    return v1_fail_rate_3A(v1R)

#### REPORT 2 : "Close Match" - v2_fail_rates

# In [21]:
def v2_fail_rate_3A(v2R):
    """Compute the overall share of Score-0 responses per question (pilot 3A).

    ``v2R`` must contain the question descriptors listed below plus
    ``Language``, ``Answers``, ``Score`` and ``_worker_id``. The result keeps
    the Score 0 rows only, carries ``Count_of_Test_Takers``,
    ``Total_Test_Takers`` and ``Overall_Fail_Rate`` (count / total, rounded
    to 2 decimals), is ordered by ``Fluency`` then ``_unit_id`` and no longer
    has the ``Score`` column.
    """
    question_keys = ['Fluency', '_unit_id', 'question_', 'a_domain', 'a_register', 'wordphrase_a',
                     'b_domain', 'b_register', 'wordphrase_b', 'difficulty']
    outcome_keys = ['Language'] + question_keys + ['Answers', 'Score']

    # number of workers behind every question / answers / score combination
    tally = (
        v2R[outcome_keys + ['_worker_id']]
        .groupby(outcome_keys)['_worker_id']
        .count()
        .reset_index()
        .rename(columns={'_worker_id': 'Count_of_Test_Takers'})
    )

    # number of workers per question (Language is not part of this grouping)
    tally['Total_Test_Takers'] = tally.groupby(question_keys)['Count_of_Test_Takers'].transform('sum')
    tally['Overall_Fail_Rate'] = (tally['Count_of_Test_Takers'] / tally['Total_Test_Takers']).round(2)

    # keep the Score 0 rows, ordered by Fluency and _unit_id
    failed = tally[tally['Score'] == 0]
    failed = failed.sort_values(['Fluency', '_unit_id'], ascending=[True, True])

    return failed.drop('Score', axis=1).reset_index(drop=True)

def v2_fail_rate_2_3A(v2R):
    """Spread the Score-0 rates of every rater answer into columns (pilot 3A).

    For each question the share of workers behind each ``rater_answer`` is
    computed (count / total workers on the question, rounded to 2 decimals).
    Only the Score 0 rows are kept and then pivoted: one row per question and
    ``Answers`` value, one column per ``rater_answer`` holding its rate.
    """
    question_keys = ['Fluency', '_unit_id', 'question_', 'a_domain', 'a_register', 'wordphrase_a',
                     'b_domain', 'b_register', 'wordphrase_b', 'difficulty']
    response_keys = ['Language'] + question_keys + ['rater_answer', 'Answers', 'Score']

    # number of workers behind every question / rater answer / score combination
    tally = (
        v2R[response_keys + ['_worker_id']]
        .groupby(response_keys)['_worker_id']
        .count()
        .reset_index()
        .rename(columns={'_worker_id': 'Count_of_Test_Takers'})
    )

    # number of workers per question (Language is not part of this grouping)
    tally['Total_Test_Takers'] = tally.groupby(question_keys)['Count_of_Test_Takers'].transform('sum')
    tally['Rate'] = (tally['Count_of_Test_Takers'] / tally['Total_Test_Takers']).round(2)

    # keep the Score 0 rows and strip the helper columns before pivoting
    wrong = tally[tally['Score'] == 0]
    wrong = wrong.sort_values(['Fluency', '_unit_id', 'Score', 'Rate'], ascending=[True, True, True, False])
    wrong = wrong.drop(['Score', 'Count_of_Test_Takers', 'Total_Test_Takers'], axis=1)
    wrong = wrong.reset_index(drop=True)

    # one column per rater answer
    wide = pd.pivot_table(
        wrong,
        index=['Language'] + question_keys + ['Answers'],
        values='Rate',
        columns=['rater_answer'],
    ).reset_index()
    wide.columns.name = None  # remove name for columns

    return wide.drop_duplicates()

def merge_v2_fail_rates_3A(v2_actual_correct_by_question, v2_actual_correct_by_question_with_answer):
    """Left-join the per-question rates onto the per-rater-answer table.

    ``v2_actual_correct_by_question_with_answer`` is the left side, so each
    of its rows is kept. Rows are matched on the question descriptors and
    ``Answers``. No column selection or re-ordering is applied: the result
    keeps the column order produced by the merge.
    """
    join_keys = ["Language", "Fluency", "_unit_id", "question_", "a_domain", "a_register",
                 "wordphrase_a", "b_domain", "b_register", "wordphrase_b", "difficulty", "Answers"]

    return v2_actual_correct_by_question_with_answer.merge(
        v2_actual_correct_by_question, how='left', on=join_keys
    )

def generate_report_2_3A(v2R):
    """Report 2 for pilot 3A.

    Builds the per-question table (``v2_fail_rate_3A``) and the
    per-rater-answer table (``v2_fail_rate_2_3A``) from ``v2R`` and returns
    their merge (``merge_v2_fail_rates_3A``).
    """
    by_question = v2_fail_rate_3A(v2R)
    by_question_with_answer = v2_fail_rate_2_3A(v2R)

    return merge_v2_fail_rates_3A(by_question, by_question_with_answer)

#### REPORT 3 : "Reading Comprehension" : rc_question_skill_pass_rate

# In [30]:
def rc_fail_rate_3A(rcR):
    """Reshape the reading-comprehension frame to one row per worker and question.

    For each of the four questions the rater's choice (``question_no_N``) is
    compared with ``Answer_no_N``: 1 when equal, 0 otherwise. A question whose
    rater column or answer column is entirely null gets no score and its rows
    are removed from the result.

    The returned frame holds the identifying columns followed by ``Score``
    (int), ``Question``, ``Difficulty``, ``Google_Translate_Error`` and
    ``Skill``. Its index is not reset after the unscored rows are dropped.
    """
    numbers = (1, 2, 3, 4)
    id_columns = ['Language', '_worker_id', '_country', 'Fluency', 'Time_Taken_Seconds',
                  '_unit_id', 'title', 'test_', 'register', 'topic', 'text_type',
                  'complexity', 'familiarity']
    difficulty_cols = ['question_%d_difficulty' % n for n in numbers]
    translate_cols = ['question_%d_google_translate_error' % n for n in numbers]
    skill_cols = ['Question %d Skill tested' % n for n in numbers]
    rater_cols = ['question_no_%d' % n for n in numbers]
    key_cols = ['Answer_no_%d' % n for n in numbers]
    flag_cols = ['a%d' % n for n in numbers]

    frame = rcR[id_columns + difficulty_cols + translate_cols + skill_cols
                + rater_cols + key_cols + ['Score']]

    # 1/0 flag (as text) per question; NaN when the question has no data at all
    for rater_col, key_col, flag_col in zip(rater_cols, key_cols, flag_cols):
        if frame[rater_col].isnull().all() or frame[key_col].isnull().all():
            frame[flag_col] = np.nan
        else:
            frame[flag_col] = np.where(frame[rater_col] == frame[key_col], 1, 0).astype('str')

    frame = frame.drop(rater_cols + key_cols + ['Score'], axis=1)

    # pack the four per-question values of each attribute into one ';' separated cell
    frame['Score'] = frame[flag_cols].astype('str').agg(';'.join, axis=1)
    frame['Question'] = ';'.join(['Question 1', 'Question 2', 'Question 3', 'Question 4'])
    frame['Difficulty'] = frame[difficulty_cols].astype('str').agg(';'.join, axis=1)
    frame['Google_Translate_Error'] = frame[translate_cols].astype('str').agg(';'.join, axis=1)
    frame['Skill'] = frame[skill_cols].astype('str').agg(';'.join, axis=1)

    frame = frame.drop(difficulty_cols + translate_cols + skill_cols + flag_cols, axis=1)

    # split the packed cells again, this time into rows (separate_rows in R)
    long_frame = (
        frame.set_index(id_columns)
        .apply(lambda column: column.str.split(';').explode())
        .reset_index()
    )

    # the text 'nan' marks missing values introduced by the str conversion
    packed_cols = ['Score', 'Question', 'Difficulty', 'Google_Translate_Error', 'Skill']
    long_frame[packed_cols] = long_frame[packed_cols].replace('nan', np.nan)
    long_frame = long_frame.dropna(subset=['Score'])  # unscored questions
    long_frame['Score'] = long_frame['Score'].astype('int')

    return long_frame

## Melt RC and categorize question choice with letter and question number
def melt_rc_assign_3A(rc_choices, q_list, choice_list):
    """Label melted choice rows with their question and answer letter.

    For every (question number, choice number) pair the rows of
    ``rc_choices`` whose ``variable`` contains ``question_<q>`` and
    ``choice_<c>`` are selected, given ``Question`` = ``'Question <q>'`` and,
    for choice 1/2/3, ``Answer`` = ``'a'``/``'b'``/``'c'``. The labelled
    pieces are concatenated in iteration order.
    """
    letter_for_choice = {1: 'a', 2: 'b', 3: 'c'}

    labelled = []
    for question_number in q_list:
        for choice_number in choice_list:
            for_question = rc_choices[rc_choices['variable'].str.contains('question_' + str(question_number))]
            for_choice = for_question[for_question['variable'].str.contains('choice_' + str(choice_number))]

            new_columns = {'Question': 'Question ' + str(question_number)}
            # choices other than 1, 2 and 3 receive no answer letter
            if choice_number in letter_for_choice:
                new_columns['Answer'] = letter_for_choice[choice_number]

            labelled.append(for_choice.assign(**new_columns))

    return pd.concat(labelled)

## Melt RC and categorize question choice with letter and question number
def melt_rc_3A(rcR):
    """Melt the answer-choice columns into one row per unit, question and choice.

    The identifying columns and the twelve ``question_<q>_choice_<c>`` columns
    are de-duplicated, melted to long format, labelled by
    ``melt_rc_assign_3A`` and sorted. The same frame is returned three times
    (as ``rc_choices``, ``actual_answer`` and ``rater_answer``).
    """
    id_columns = ['Language', '_unit_id', 'title', 'test_']
    q_list, choice_list = [1, 2, 3, 4], [1, 2, 3]
    choice_columns = ['question_%d_choice_%d' % (q, c) for q in q_list for c in choice_list]

    # one line per distinct unit before going long
    distinct_units = rcR[id_columns + choice_columns].drop_duplicates().reset_index(drop=True)
    long_choices = pd.melt(distinct_units, id_vars=id_columns)

    rc_choices = melt_rc_assign_3A(long_choices, q_list, choice_list)
    rc_choices = rc_choices[id_columns + ['Question', 'Answer', 'variable', 'value']]
    rc_choices = rc_choices.sort_values(['Language', 'title', 'test_', 'Question', 'Answer'])

    # the lookup for actual answers and for rater answers is the very same table
    return rc_choices, rc_choices, rc_choices

# ## Melt RC into long format with actual answers
def melt_rc_answer_actual_3A(rcR):
    """Reshape the reading-comprehension frame to long format, keeping the answers.

    Works like a per-question scoring pass: for each of the four questions the
    rater's choice (``question_no_N``) is compared with ``Answer_no_N`` (1 when
    equal, 0 otherwise; no score when either column is entirely null). On top
    of ``Score`` the result keeps the rater's choice (``Rater_Answer``) and the
    expected one (``Actual_Answer``), followed by ``Question``,
    ``Difficulty``, ``Google_Translate_Error`` and ``Skill``.

    Rows without a score are dropped and the index is not reset afterwards.
    """
    numbers = (1, 2, 3, 4)
    id_columns = ['Language', '_worker_id', '_country', 'Fluency', 'Time_Taken_Seconds',
                  '_unit_id', 'title', 'test_', 'register', 'topic', 'text_type',
                  'complexity', 'familiarity']
    difficulty_cols = ['question_%d_difficulty' % n for n in numbers]
    translate_cols = ['question_%d_google_translate_error' % n for n in numbers]
    skill_cols = ['Question %d Skill tested' % n for n in numbers]
    rater_cols = ['question_no_%d' % n for n in numbers]
    key_cols = ['Answer_no_%d' % n for n in numbers]
    flag_cols = ['a%d' % n for n in numbers]

    frame = rcR[id_columns + difficulty_cols + translate_cols + skill_cols
                + rater_cols + key_cols + ['Score']]

    # 1/0 flag (as text) per question; NaN when the question has no data at all
    for rater_col, key_col, flag_col in zip(rater_cols, key_cols, flag_cols):
        if frame[rater_col].isnull().all() or frame[key_col].isnull().all():
            frame[flag_col] = np.nan
        else:
            frame[flag_col] = np.where(frame[rater_col] == frame[key_col], 1, 0).astype('str')

    frame = frame.drop('Score', axis=1)

    # pack the four per-question values of each attribute into one ';' separated cell
    frame['Score'] = frame[flag_cols].astype('str').agg(';'.join, axis=1)
    frame['Rater_Answer'] = frame[rater_cols].astype('str').agg(';'.join, axis=1)
    frame['Actual_Answer'] = frame[key_cols].astype('str').agg(';'.join, axis=1)
    frame['Question'] = ';'.join(['Question 1', 'Question 2', 'Question 3', 'Question 4'])
    frame['Difficulty'] = frame[difficulty_cols].astype('str').agg(';'.join, axis=1)
    frame['Google_Translate_Error'] = frame[translate_cols].astype('str').agg(';'.join, axis=1)
    frame['Skill'] = frame[skill_cols].astype('str').agg(';'.join, axis=1)

    frame = frame.drop(difficulty_cols + translate_cols + skill_cols
                       + rater_cols + key_cols + flag_cols, axis=1)

    # split the packed cells again, this time into rows (separate_rows in R)
    rc_answer_actual = (
        frame.set_index(id_columns)
        .apply(lambda column: column.str.split(';').explode())
        .reset_index()
    )

    # the text 'nan' marks missing values introduced by the str conversion
    packed_cols = ['Score', 'Rater_Answer', 'Actual_Answer', 'Question', 'Difficulty',
                   'Google_Translate_Error', 'Skill']
    rc_answer_actual[packed_cols] = rc_answer_actual[packed_cols].replace('nan', np.nan)
    rc_answer_actual = rc_answer_actual.dropna(subset=['Score'])  # unscored questions
    rc_answer_actual['Score'] = rc_answer_actual['Score'].astype('int')

    return rc_answer_actual

def rc_q_s_pass_rate_3A(rc_answer):
    """Compute the share of Score-0 answers per reading-comprehension question.

    ``rc_answer`` is a long frame with one row per worker and question. The
    result has one row per question whose ``Score`` is 0 with ``Count``,
    ``Total`` and ``Fail_Rate`` (count / total, rounded to 2 decimals),
    ordered by ``Fluency`` and then by descending ``Fail_Rate``.
    """
    leading_keys = ['Language', 'Fluency', '_unit_id', 'title', 'test_']
    trailing_keys = ['Question', 'Difficulty', 'register', 'Skill']

    # number of workers per question and score
    tally = (
        rc_answer.groupby(leading_keys + ['Score'] + trailing_keys)['_worker_id']
        .count()
        .reset_index()
        .rename(columns={'_worker_id': 'Count'})
    )

    # number of workers per question, whatever their score
    tally['Total'] = tally.groupby(leading_keys + trailing_keys)['Count'].transform('sum')
    tally['Fail_Rate'] = (tally['Count'] / tally['Total']).round(2)

    # keep the Score 0 rows, highest fail rate first within each Fluency
    failed = tally[tally['Score'] == 0]
    failed = failed.sort_values(['Fluency', 'Fail_Rate'], ascending=[True, False])

    return failed.reset_index(drop=True)

def generate_report_3_3A(rcR):
    """Report 3 ("Reading Comprehension") for pilot 3A.

    Reshapes ``rcR`` to one scored row per worker and question with
    ``rc_fail_rate_3A`` and returns the per-question rates computed by
    ``rc_q_s_pass_rate_3A``.

    The choice melt (``melt_rc_3A``) and the answer-level melt
    (``melt_rc_answer_actual_3A``) used to be computed here as well, but
    their results were never read. They are not needed for this report and
    are no longer evaluated.
    """
    rc_answer = rc_fail_rate_3A(rcR)

    return rc_q_s_pass_rate_3A(rc_answer)

#### REPORT 4 : "RC with Answers" : rc_question_skill_pass_rate_answer_final

# In [23]:
def rc_q_s_pass_rate_answer_3A(rc_answer_actual):
    """Compute, per question, the share of workers behind each Score-0 answer.

    Workers are counted per question, ``Actual_Answer``, ``Rater_Answer`` and
    ``Score``; ``Total`` is the number of workers on the question and
    ``Fail_Rate`` is count / total rounded to 2 decimals. Only the Score 0
    rows are returned, ordered by ``Fluency``, ``_unit_id``, ``Question`` and
    descending ``Fail_Rate``.
    """
    leading_keys = ['Language', 'Fluency', '_unit_id', 'title', 'test_']
    trailing_keys = ['Question', 'Difficulty', 'register', 'Skill']
    answer_keys = leading_keys + ['Actual_Answer', 'Rater_Answer', 'Score'] + trailing_keys

    # number of workers per question, answer pair and score
    tally = (
        rc_answer_actual.groupby(answer_keys)['_worker_id']
        .count()
        .reset_index()
        .rename(columns={'_worker_id': 'Count'})
    )

    # number of workers per question, whatever they answered
    tally['Total'] = tally.groupby(leading_keys + trailing_keys)['Count'].transform('sum')
    tally['Fail_Rate'] = (tally['Count'] / tally['Total']).round(2)

    # keep the Score 0 rows, most frequent wrong answer first within a question
    failed = tally[tally['Score'] == 0]
    failed = failed.sort_values(['Fluency', '_unit_id', 'Question', 'Fail_Rate'],
                                ascending=[True, True, True, False])

    return failed.reset_index(drop=True)

def join_rc_q_s_pass_rate_answer_3A(rc_question_skill_pass_rate_answer, actual_answer, rater_answer):
    """Attach the text of the actual and of the rater's answer to the rates.

    The rate table is left-joined twice against the choice lookups: first on
    ``Actual_Answer`` (lookup ``actual_answer``), then on ``Rater_Answer``
    (lookup ``rater_answer``). Both lookups bring a ``value`` column, which
    the second merge suffixes ``_x`` / ``_y``. The result is reduced to the
    report columns, with the answer letters and texts renamed to
    ``*_Answer_Letter`` / ``*_Answer_Text``.
    """
    unit_keys = ["Language", "_unit_id", "title", "test_", "Question"]

    # text of the expected answer
    with_actual = rc_question_skill_pass_rate_answer.merge(
        actual_answer, how='left',
        left_on=unit_keys + ["Actual_Answer"],
        right_on=unit_keys + ["Answer"],
    ).drop('Answer', axis=1)

    # text of the answer chosen by the rater
    with_both = with_actual.merge(
        rater_answer, how='left',
        left_on=unit_keys + ["Rater_Answer"],
        right_on=unit_keys + ["Answer"],
    ).drop('Answer', axis=1)

    report_columns = ['Language', 'Fluency', '_unit_id', 'title', 'test_', 'Difficulty', 'register',
                      'Skill', 'Question', 'Actual_Answer', 'value_x', 'Rater_Answer', 'value_y',
                      'Count', 'Total', 'Fail_Rate']
    new_names = {
        "Actual_Answer": "Actual_Answer_Letter",
        "value_x": "Actual_Answer_Text",
        "Rater_Answer": "Rater_Answer_Letter",
        "value_y": "Rater_Answer_Text",
    }

    return with_both[report_columns].rename(columns=new_names)


def generate_report_4_3A(rcR):
    """Report 4 ("RC with Answers") for pilot 3A.

    Chains ``melt_rc_3A`` (choice lookups), ``melt_rc_answer_actual_3A``
    (long scored answers), ``rc_q_s_pass_rate_answer_3A`` (rates per wrong
    answer) and ``join_rc_q_s_pass_rate_answer_3A`` (answer texts).
    """
    # the first element (the full choice table) is not needed here
    _, actual_answer, rater_answer = melt_rc_3A(rcR)

    answer_rates = rc_q_s_pass_rate_answer_3A(melt_rc_answer_actual_3A(rcR))

    return join_rc_q_s_pass_rate_answer_3A(answer_rates, actual_answer, rater_answer)

### PILOT 1A-1B

#### REPORT 1 : "Near Exact Match" - v1_actual_correct_by_question

# In [44]:
def v1_fail_rate_1A_1B(v1R):
    """Compute the share of Score-0 responses per question and tenure (pilot 1A-1B).

    ``v1R`` must contain the question descriptors listed below plus
    ``Language``, ``Answer``, ``Score`` and ``_worker_id``. The result has one
    row per question/answer combination whose ``Score`` is 0, with
    ``Count_of_Test_Takers``, ``Total_Test_Takers`` and ``Fail_Rate``
    (count / total, rounded to 2 decimals), ordered by ``Fluency``,
    ``Tenure`` and descending ``Fail_Rate``.
    """
    question_keys = ['Fluency', 'Tenure', '_unit_id', 'question_', 'a_domain', 'a_register',
                     'wordphrase_a', 'b_domain', 'b_register', 'wordphrase_b', 'difficulty']
    outcome_keys = ['Language'] + question_keys + ['Answer', 'Score']

    # number of workers behind every question / answer / score combination
    tally = (
        v1R[outcome_keys + ['_worker_id']]
        .groupby(outcome_keys)['_worker_id']
        .count()
        .reset_index()
        .rename(columns={'_worker_id': 'Count_of_Test_Takers'})
    )

    # number of workers per question (Language is not part of this grouping)
    tally['Total_Test_Takers'] = tally.groupby(question_keys)['Count_of_Test_Takers'].transform('sum')
    tally['Fail_Rate'] = (tally['Count_of_Test_Takers'] / tally['Total_Test_Takers']).round(2)

    # keep the Score 0 rows, highest fail rate first within Fluency / Tenure
    failed = tally[tally['Score'] == 0]
    failed = failed.sort_values(['Fluency', 'Tenure', 'Fail_Rate'], ascending=[True, True, False])

    return failed.reset_index(drop=True)

def generate_report_1_1A_1B(v1R):
    """Report 1 for pilot 1A-1B: the frame produced by ``v1_fail_rate_1A_1B``."""
    return v1_fail_rate_1A_1B(v1R)

#### REPORT 2 : "Close Match" - v2_fail_rates

# In [46]:
def v2_fail_rate_1A_1B(v2R):
    """Compute the share of Score-0 responses per question and tenure (pilot 1A-1B).

    ``v2R`` must contain the question descriptors listed below plus
    ``Language``, ``Answers``, ``Score`` and ``_worker_id``. The result keeps
    the Score 0 rows only, carries ``Count_of_Test_Takers``,
    ``Total_Test_Takers`` and ``Fail_Rate`` (count / total, rounded to 2
    decimals), is ordered by ``Fluency``, ``Tenure`` and descending
    ``Fail_Rate`` and no longer has the ``Score`` column.
    """
    question_keys = ['Fluency', 'Tenure', '_unit_id', 'question_', 'a_domain', 'a_register',
                     'wordphrase_a', 'b_domain', 'b_register', 'wordphrase_b', 'difficulty']
    outcome_keys = ['Language'] + question_keys + ['Answers', 'Score']

    # number of workers behind every question / answers / score combination
    tally = (
        v2R[outcome_keys + ['_worker_id']]
        .groupby(outcome_keys)['_worker_id']
        .count()
        .reset_index()
        .rename(columns={'_worker_id': 'Count_of_Test_Takers'})
    )

    # number of workers per question (Language is not part of this grouping)
    tally['Total_Test_Takers'] = tally.groupby(question_keys)['Count_of_Test_Takers'].transform('sum')
    tally['Fail_Rate'] = (tally['Count_of_Test_Takers'] / tally['Total_Test_Takers']).round(2)

    # keep the Score 0 rows, highest fail rate first within Fluency / Tenure
    failed = tally[tally['Score'] == 0]
    failed = failed.sort_values(['Fluency', 'Tenure', 'Fail_Rate'], ascending=[True, True, False])

    return failed.drop('Score', axis=1).reset_index(drop=True)

def v2_fail_rate_2_1A_1B(v2R):
    """Compute, per question, the share of workers behind each Score-0 rater answer (pilot 1A-1B).

    Parameters
    ----------
    v2R : pandas.DataFrame
        Must contain ``Language``, the question descriptors listed below,
        ``rater_answer``, ``Answers``, ``Score`` and ``_worker_id``.

    Returns
    -------
    pandas.DataFrame
        The Score 0 rows with ``Count_of_Test_Takers``, ``Total_Test_Takers``
        and ``Fail_Rate`` (count / total, rounded to 2 decimals), ordered by
        ``Fluency``, ``Tenure``, ``_unit_id``, ``Score`` and descending
        ``Fail_Rate``. The index is the one left by the filter and the sort
        (it is not reset).

    Notes
    -----
    The sort used to name a ``'Rate'`` column that this function never
    creates and supplied four ``ascending`` flags for five sort keys, so the
    function could only raise. It now sorts on ``'Fail_Rate'`` with one flag
    per key, mirroring the ordering used by ``v2_fail_rate_2_3A``.
    """
    question_keys = ['Fluency', 'Tenure', '_unit_id', 'question_', 'a_domain', 'a_register',
                     'wordphrase_a', 'b_domain', 'b_register', 'wordphrase_b', 'difficulty']
    response_keys = ['Language'] + question_keys + ['rater_answer', 'Answers', 'Score']

    vR_temp = v2R[response_keys + ['_worker_id']]

    # first grouping: workers per question / rater answer / score combination
    vR_grouped = vR_temp.groupby(response_keys)['_worker_id'].count().reset_index()
    vR_grouped = vR_grouped.rename(columns={"_worker_id": "Count_of_Test_Takers"})

    # second grouping: workers per question, whatever they answered
    vR_grouped['Total_Test_Takers'] = vR_grouped.groupby(question_keys)['Count_of_Test_Takers'].transform('sum')
    vR_grouped['Fail_Rate'] = (vR_grouped['Count_of_Test_Takers'] / vR_grouped['Total_Test_Takers']).round(2)

    # filter Score 0
    vR_grouped = vR_grouped[vR_grouped['Score'] == 0]

    # most frequent wrong answer first within each question
    vR_grouped = vR_grouped.sort_values(['Fluency', 'Tenure', '_unit_id', 'Score', 'Fail_Rate'],
                                        ascending=[True, True, True, True, False])

    return vR_grouped

def generate_report_2_1A_1B(v2R):
    """Build Report 2 (Close Match) for the 1A/1B pilots.

    Both fail-rate tables are computed from ``v2R``, but only the table that
    is broken down by answer is returned.
    """
    # Per-question table: still computed, its result is not part of the report.
    v2_fail_rate_1A_1B(v2R)

    # Per-question-and-answer table: this is the report.
    return v2_fail_rate_2_1A_1B(v2R)

#### REPORT 3 : "Reading Comprehension" : rc_question_skill_pass_rate

In [47]:
def rc_fail_rate_1A_1B(rcR):
    """Reshape Reading Comprehension results to one row per worker and question.

    For each of the four questions the rater's choice (``question_no_N``) is
    compared with the expected answer (``Answer_no_N``), giving 1 for a match
    and 0 otherwise. A question whose rater column or answer column is
    entirely empty is treated as not asked and produces no rows.

    Args:
        rcR: DataFrame holding at least the columns selected below.

    Returns:
        DataFrame with the identifying columns plus ``Score`` (int, 0/1),
        ``Question``, ``Difficulty``, ``Google_Translate_Error`` and ``Skill``,
        one row per worker, unit and asked question.
    """
    question_numbers = (1, 2, 3, 4)
    id_cols = ['Language', '_worker_id', '_country', 'Fluency', 'Time_Taken_Seconds',
               '_unit_id', 'title', 'test_', 'register', 'topic', 'text_type',
               'complexity', 'familiarity']
    difficulty_cols = [f'question_{n}_difficulty' for n in question_numbers]
    translate_cols = [f'question_{n}_google_translate_error' for n in question_numbers]
    skill_cols = [f'Question {n} Skill tested' for n in question_numbers]
    rater_cols = [f'question_no_{n}' for n in question_numbers]
    key_cols = [f'Answer_no_{n}' for n in question_numbers]
    flag_cols = [f'a{n}' for n in question_numbers]
    long_cols = ['Score', 'Question', 'Difficulty', 'Google_Translate_Error', 'Skill']

    def _join_as_text(frame, columns):
        # Stringify element by element so a missing value always becomes the
        # literal 'nan'; astype('str') keeps it missing on newer pandas, which
        # would make ';'.join fail on a float.
        return frame[columns].apply(lambda col: col.map(str)).agg(';'.join, axis=1)

    # work on a copy so the helper columns never reach the caller's frame
    vR_temp = rcR[id_cols + difficulty_cols + translate_cols + skill_cols
                  + rater_cols + key_cols + ['Score']].copy()

    # evaluate if Answers are the same as the questions. If either Q or A are empty, return NaN
    for rater_col, key_col, flag_col in zip(rater_cols, key_cols, flag_cols):
        if vR_temp[rater_col].isnull().all() or vR_temp[key_col].isnull().all():
            vR_temp[flag_col] = np.nan
        else:
            vR_temp[flag_col] = np.where(vR_temp[rater_col] == vR_temp[key_col], 1, 0).astype('str')

    # the raw answers and the original overall Score are no longer needed
    vR_temp = vR_temp.drop(rater_cols + key_cols + ['Score'], axis=1)

    # concatenate values from different columns with delimiter ;
    vR_temp['Score'] = _join_as_text(vR_temp, flag_cols)
    vR_temp['Question'] = ';'.join(f'Question {n}' for n in question_numbers)
    vR_temp['Difficulty'] = _join_as_text(vR_temp, difficulty_cols)
    vR_temp['Google_Translate_Error'] = _join_as_text(vR_temp, translate_cols)
    vR_temp['Skill'] = _join_as_text(vR_temp, skill_cols)

    # the per-question source columns are now folded into the joined columns
    vR_temp = vR_temp.drop(difficulty_cols + translate_cols + skill_cols + flag_cols, axis=1)

    # Python explode function to split delimited columns and expand to rows - row_separate in R
    vR_temp = vR_temp.set_index(id_cols).apply(lambda x: x.str.split(';').explode()).reset_index()

    # turn the 'nan' placeholders back into real missing values
    vR_temp[long_cols] = vR_temp[long_cols].replace('nan', np.nan)
    vR_temp = vR_temp.dropna(subset=['Score'])  # remove rows with NaN values in Score
    vR_temp['Score'] = vR_temp['Score'].astype('int')  # set Score as integer

    return vR_temp

## Label already-melted RC choice rows with their question number and answer letter
def melt_rc_assign_1A_1B(rc_choices, q_list, choice_list):
    """Tag melted choice rows with a question label and an answer letter.

    For every (question, choice) pair the rows whose ``variable`` contains
    both ``question_<q>`` and ``choice_<c>`` are picked out, given a
    ``Question`` column ("Question <q>") and, for choices 1-3, an ``Answer``
    column ('a', 'b' or 'c'). The pieces are stacked in loop order.
    """
    letter_for_choice = {1: 'a', 2: 'b', 3: 'c'}

    pieces = []
    for q_no in q_list:
        question_rows = rc_choices[rc_choices['variable'].str.contains('question_' + str(q_no))]
        for c_no in choice_list:
            is_choice = question_rows['variable'].str.contains('choice_' + str(c_no))
            piece = question_rows[is_choice].copy()
            piece['Question'] = 'Question ' + str(q_no)
            # choices outside 1-3 get no Answer value
            if c_no in letter_for_choice:
                piece['Answer'] = letter_for_choice[c_no]
            pieces.append(piece)

    return pd.concat(pieces)

## Melt the RC choice columns into long format, then label each row with its question number and answer letter
def melt_rc_1A_1B(rcR):
    """Turn the wide RC choice columns into a labelled long table.

    The twelve ``question_<q>_choice_<c>`` columns are de-duplicated, melted
    against the identifying columns, labelled via ``melt_rc_assign_1A_1B`` and
    sorted. The same table is returned three times (as the choices, the
    actual-answer lookup and the rater-answer lookup).
    """
    id_cols = ['Language', '_unit_id', 'title', 'test_']
    q_list, choice_list = [1, 2, 3, 4], [1, 2, 3]
    choice_cols = [f'question_{q}_choice_{c}' for q in q_list for c in choice_list]

    # one row per distinct unit, then one row per (unit, choice column)
    distinct_units = rcR[id_cols + choice_cols].drop_duplicates().reset_index(drop=True)
    long_choices = pd.melt(distinct_units, id_vars=id_cols)

    labelled = melt_rc_assign_1A_1B(long_choices, q_list, choice_list)
    labelled = labelled[id_cols + ['Question', 'Answer', 'variable', 'value']]
    labelled = labelled.sort_values(['Language', 'title', 'test_', 'Question', 'Answer'])

    return labelled, labelled, labelled

# ## Melt RC into long format with actual answers
def melt_rc_answer_actual_1A_1B(rcR):
    """Reshape RC results to one row per worker and question, keeping the answers.

    For each of the four questions the rater's choice (``question_no_N``) is
    compared with the expected answer (``Answer_no_N``), giving 1 for a match
    and 0 otherwise. A question whose rater column or answer column is
    entirely empty is treated as not asked and produces no rows.

    Args:
        rcR: DataFrame holding at least the columns selected below.

    Returns:
        DataFrame with the identifying columns plus ``Score`` (int, 0/1),
        ``Rater_Answer``, ``Actual_Answer``, ``Question``, ``Difficulty``,
        ``Google_Translate_Error`` and ``Skill``, one row per worker, unit and
        asked question.
    """
    question_numbers = (1, 2, 3, 4)
    id_cols = ['Language', '_worker_id', '_country', 'Fluency', 'Time_Taken_Seconds',
               '_unit_id', 'title', 'test_', 'register', 'topic', 'text_type',
               'complexity', 'familiarity']
    difficulty_cols = [f'question_{n}_difficulty' for n in question_numbers]
    translate_cols = [f'question_{n}_google_translate_error' for n in question_numbers]
    skill_cols = [f'Question {n} Skill tested' for n in question_numbers]
    rater_cols = [f'question_no_{n}' for n in question_numbers]
    key_cols = [f'Answer_no_{n}' for n in question_numbers]
    flag_cols = [f'a{n}' for n in question_numbers]
    long_cols = ['Score', 'Rater_Answer', 'Actual_Answer', 'Question', 'Difficulty',
                 'Google_Translate_Error', 'Skill']

    def _join_as_text(frame, columns):
        # Stringify element by element so a missing value always becomes the
        # literal 'nan'; astype('str') keeps it missing on newer pandas, which
        # would make ';'.join fail on a float.
        return frame[columns].apply(lambda col: col.map(str)).agg(';'.join, axis=1)

    # work on a copy so the helper columns never reach the caller's frame
    vR_temp = rcR[id_cols + difficulty_cols + translate_cols + skill_cols
                  + rater_cols + key_cols + ['Score']].copy()

    # evaluate if Answers are the same as the questions. If either Q or A are empty, return NaN
    for rater_col, key_col, flag_col in zip(rater_cols, key_cols, flag_cols):
        if vR_temp[rater_col].isnull().all() or vR_temp[key_col].isnull().all():
            vR_temp[flag_col] = np.nan
        else:
            vR_temp[flag_col] = np.where(vR_temp[rater_col] == vR_temp[key_col], 1, 0).astype('str')

    # the original overall Score is replaced by the per-question flags
    vR_temp = vR_temp.drop('Score', axis=1)

    # concatenate values from different columns with delimiter ;
    vR_temp['Score'] = _join_as_text(vR_temp, flag_cols)
    vR_temp['Rater_Answer'] = _join_as_text(vR_temp, rater_cols)
    vR_temp['Actual_Answer'] = _join_as_text(vR_temp, key_cols)
    vR_temp['Question'] = ';'.join(f'Question {n}' for n in question_numbers)
    vR_temp['Difficulty'] = _join_as_text(vR_temp, difficulty_cols)
    vR_temp['Google_Translate_Error'] = _join_as_text(vR_temp, translate_cols)
    vR_temp['Skill'] = _join_as_text(vR_temp, skill_cols)

    # the per-question source columns are now folded into the joined columns
    vR_temp = vR_temp.drop(difficulty_cols + translate_cols + skill_cols
                           + rater_cols + key_cols + flag_cols, axis=1)

    # Python explode function to split delimited columns and expand to rows - row_separate in R
    vR_temp = vR_temp.set_index(id_cols).apply(lambda x: x.str.split(';').explode()).reset_index()

    # turn the 'nan' placeholders back into real missing values
    vR_temp[long_cols] = vR_temp[long_cols].replace('nan', np.nan)
    vR_temp = vR_temp.dropna(subset=['Score'])  # remove rows with NaN values in Score
    vR_temp['Score'] = vR_temp['Score'].astype('int')  # set Score as integer

    return vR_temp

def rc_q_s_pass_rate_1A_1B(rc_answer):
    """Compute the Reading Comprehension fail rate per question and skill.

    Workers are counted per question and score; each count is divided by the
    number of workers who answered the same question. Only the ``Score == 0``
    rows are returned, sorted by fluency with the highest fail rate first.
    """
    question_keys = ['Language', 'Fluency', '_unit_id', 'title', 'test_',
                     'Question', 'Difficulty', 'register', 'Skill']
    # Score sits between 'test_' and 'Question' in the per-score grouping
    score_keys = question_keys[:5] + ['Score'] + question_keys[5:]

    # workers per question and score
    counts = rc_answer.groupby(score_keys)['_worker_id'].count().rename('Count').reset_index()

    # workers per question, regardless of score
    counts['Total'] = counts.groupby(question_keys)['Count'].transform('sum')
    counts['Fail_Rate'] = round(counts['Count'] / counts['Total'], 2)

    # keep the failures only
    failures = counts.loc[counts['Score'] == 0]

    # order by fluency, worst questions first, and renumber the rows
    failures = failures.sort_values(['Fluency', 'Fail_Rate'], ascending=[True, False])
    return failures.reset_index(drop=True)

def generate_report_3_1A_1B(rcR):
    """Build Report 3 (Reading Comprehension) for the 1A/1B pilots.

    Runs every RC reshaping step on ``rcR`` and returns the per-question
    fail-rate table derived from the long-format answers.
    """
    long_answers = rc_fail_rate_1A_1B(rcR)

    # These intermediates are still produced, but the report does not use them.
    _choices, _actual_lookup, _rater_lookup = melt_rc_1A_1B(rcR)
    _answers_with_actual = melt_rc_answer_actual_1A_1B(rcR)

    return rc_q_s_pass_rate_1A_1B(long_answers)

In [41]:
def generate_all_fail_rate_reports(rcR, v1R, v2R, rc, v1, v2, run_value, pilot_var_selected):
    """Generate the four fail-rate reports and collect the three summaries.

    Args:
        rcR, v1R, v2R: language-filtered RC / V1 / V2 frames passed to the
            report generators.
        rc, v1, v2: frames stored as-is in the summaries dictionary.
        run_value: run type; 'Deployment' gives the 'deployment_' key prefix,
            any other value is used as the prefix itself.
        pilot_var_selected: pilot variation; only 'Pilot 3A' is handled here.

    Returns:
        Tuple ``(list_of_datasets, list_of_summaries)``: a dict of report
        frames keyed by sheet name and a dict of summary frames keyed by
        output file stem.

    Raises:
        ValueError: if ``pilot_var_selected`` is not 'Pilot 3A'. Previously
            this case fell through to the return statement and failed with an
            UnboundLocalError.
    """
    if pilot_var_selected != 'Pilot 3A':
        raise ValueError(
            f"Fail rate reports are only implemented for 'Pilot 3A'; got {pilot_var_selected!r}"
        )

    # Report 1 - Near Exact Match - v1_actual_correct_by_question
    v1_actual_correct_by_question = generate_report_1_3A(v1R)

    # Report 2 - Close Match - v2_fail_rates
    v2_fail_rates = generate_report_2_3A(v2R)

    # Report 3 - Reading Comprehension - rc_question_skill_pass_rate
    rc_question_skill_pass_rate = generate_report_3_3A(rcR)

    # Report 4 - RC with Answers - rc_question_skill_pass_rate_answer_final
    rc_question_skill_pass_rate_answer_final = generate_report_4_3A(rcR)

    # store all 4 reports into a dictionary set
    list_of_datasets = {"Near Exact Match": v1_actual_correct_by_question,
                        "Close Match": v2_fail_rates,
                        "Reading Comprehension": rc_question_skill_pass_rate,
                        "RC with Answers": rc_question_skill_pass_rate_answer_final}

    # store all 3 summaries into a dictionary set; deployment runs use a
    # lower-case prefix, every other run uses its own run name
    summary_prefix = 'deployment' if run_value == 'Deployment' else run_value
    list_of_summaries = {summary_prefix + "_rc": rc,
                         summary_prefix + "_v1": v1,
                         summary_prefix + "_v2": v2}

    return list_of_datasets, list_of_summaries

def file_check_create(root_path, config, language_selected, run_value, pilot_var_selected):
    """Create the output folders for a run and return their locations.

    Deployment runs write deliverables to ``<deliverable>/<run>/<language>``
    and summaries under 'Deployment Summary'; every other run writes to
    ``<deliverable>/<run>/<pilot variation>/<language>`` and 'Grand Summary'.
    The analysis folder always gets 'RC', 'V1' and 'V2' subfolders.

    Returns:
        Tuple ``(run_folder, analysis_folder, folder_tag)``.
    """
    def _ensure_dir(path):
        # create the folder (and parents) only when nothing exists at that path
        if not os.path.exists(path):
            os.makedirs(path, exist_ok=True)

    is_deployment = run_value == 'Deployment'

    if is_deployment:
        run_parts = (run_value, language_selected)
    else:
        run_parts = (run_value, pilot_var_selected, language_selected)
    run_folder = os.path.join(root_path, config['report']['deliverable'], *run_parts)
    _ensure_dir(run_folder)

    folder_tag = 'Deployment Summary' if is_deployment else 'Grand Summary'
    analysis_folder = os.path.join(root_path, config['report']['analysis'], folder_tag)
    _ensure_dir(analysis_folder)

    for subfolder in ('RC', 'V1', 'V2'):
        _ensure_dir(os.path.join(analysis_folder, subfolder))

    return run_folder, analysis_folder, folder_tag

def write_fail_report_to_excel(run_folder, list_of_datasets, encode=None):
    """Write every report to its own sheet of 'language_fail_rates.xlsx'.

    Args:
        run_folder: existing folder in which the workbook is created.
        list_of_datasets: dict mapping sheet name to the DataFrame to write.
        encode: kept for backward compatibility and ignored. It used to be
            forwarded as ``encoding=`` to ``DataFrame.to_excel``, which never
            affected .xlsx output and is rejected by pandas 2.0 and later.
    """
    with pd.ExcelWriter(os.path.join(run_folder, 'language_fail_rates.xlsx')) as writer:
        for key, value in list_of_datasets.items():
            value.to_excel(writer, sheet_name=key, index=False)
            
def write_summary_to_csv(analysis_folder, list_of_summaries, encode=None):
    """Write each summary frame as ``<key>.csv`` into its analysis subfolder.

    Summaries are paired with the 'RC', 'V1' and 'V2' subfolders by position,
    so the dictionary must list them in that order.
    """
    subfolders = ('RC', 'V1', 'V2')
    for (name, frame), subfolder in zip(list_of_summaries.items(), subfolders):
        target = os.path.join(analysis_folder, subfolder, name + '.csv')
        frame.to_csv(target, index=False, encoding=encode)

#### Run all 

In [42]:
def main():
    """Run the whole fail-rate pipeline interactively.

    Loads the processed data, asks which language to assess, builds the
    reports and summaries, creates the output folders and writes the Excel
    workbook and the CSV summaries. The main inputs and results are returned
    so they can be inspected afterwards.
    """
    print('\nData processing in progress...')
    # import data from data_processing module
    (raters, r1, r2, r3, languages, rc, v1, v2,
     run_value, run_value_2, survey_selected, survey_files,
     pilot_variation, pilot_selected, pilot_var_selected) = data_processing.main()
    print('Data processing completed.')
    print("\n")
    print(languages)

    # Ask which of the listed languages to assess
    language_selected = language_selection(languages)

    # Language-specific frames with the time-taken columns added
    rcR, v1R, v2R = get_time_taken_all(language_selected, rc, v1, v2)

    print('\nGenerating reports ...')

    # Build the fail rate reports and the summaries
    list_of_datasets, list_of_summaries = generate_all_fail_rate_reports(
        rcR, v1R, v2R, rc, v1, v2, run_value, pilot_var_selected)

    # Make sure the deliverable and analysis folders for this run exist
    run_folder, analysis_folder, folder_tag = file_check_create(
        root_path, config, language_selected, run_value, pilot_var_selected)

    # Reports go to one Excel workbook in run_folder
    write_fail_report_to_excel(run_folder, list_of_datasets, encode=None)
    print(f"\n1. Language fail rates report completed and stored in reports > deliverables > {run_value} > {pilot_var_selected} > {language_selected}")

    # Summaries go to csv files under analysis_folder
    write_summary_to_csv(analysis_folder, list_of_summaries, encode=None)
    print(f"\n2. Summary report completed and stored in analysis > {folder_tag} > RC/V1/V2")

    return (r1, r2, r3, rc, v1, v2, pilot_var_selected,
            rcR, v1R, v2R, list_of_datasets, list_of_summaries)
    
# Script entry point: run the pipeline and bind everything main() returns to
# module-level names.
if __name__ == "__main__":
     
    r1, r2, r3, rc, v1, v2, pilot_var_selected, rcR, v1R, v2R, list_of_datasets, list_of_summaries = main()


Data processing in progress...
Initialize data ingestion and file checking...

PASS: All files exists!



Please input the type of run e.g. Deployment, Pilot 1, Pilot 2, Pilot 3 .... etc.:  Pilot 3



Run type: Pilot 3



Please input the pilot subfolder name e.g. Pilot 1A, Pilot 2C, Pilot 3A-B .... etc.:  Pilot 3A



Pilot subfolder: Pilot 3A



Do you know the 'Language' and/or 'Market code' for this file? (y/n) :  y

Please enter the Language:  Turkish

Please enter the Market code: eg. EN-EN for English :  TR-TR



Starting automated data cleaning....

Dataframe created from RC file
Language and Market columns and values inserted to 'Summary' sheet
Language column and values inserted to 'Data' sheet
Missing columns inserted into 'Data' sheet.

Preview cleaned datasets:



df_summary_cleaned


  Language Market  _worker_id  Score  Percentage  Grouping
0  Turkish  TR-TR    45488787     24    1.000000  Pilot 3A
1  Turkish  TR-TR    45638661     24    1.000000  Pilot 3A
2  Turkish  TR-TR    45638934     24    1.000000  Pilot 3A
3  Turkish  TR-TR    45758795     23    0.958333  Pilot 3A
4  Turkish  TR-TR    45764098     24    1.000000  Pilot 3A


df_data_cleaned


  Language           _id question_no_1 question_no_2 question_no_3  \
0  Turkish  5.868341e+09             a             c             b   
1  Turkish  5.868379e+09             a             c             b   
2  Turkish  5.868379e+09             a             c             b   
3  Turkish  5.868425e+09             a             c          


Please select the number of the pilot variation:  8



You have selected 8 for 'Pilot 3A'

               Survey Filename
0  Survey Pilot 1A and 1B.xlsx
1          Survey Pilot 2.xlsx
2    Survey Pilot 2 and 3.xlsx



Please select the number of the survey filename for your pilot run:  2



You have selected 2 for 'Survey Pilot 2 and 3.xlsx'

Data processing completed.


  Language
0  Turkish



Please select the number of the Language you are assessing:  0



You have selected 0 for Turkish

Generating reports ...

1. Language fail rates report completed and stored in reports > deliverables > Pilot 3 > Pilot 3A > Turkish

2. Summary report completed and stored in analysis > Grand Summary > RC/V1/V2


In [43]:
rc

Unnamed: 0,Language,_id,question_no_1,question_no_2,question_no_3,question_no_4,question_no_5,changes,complexity,familiarity,...,Question 1 Skill tested,Question 2 Skill tested,Question 3 Skill tested,Question 4 Skill tested,Question 5 Skill tested,Grouping,31_language_1,survey_created_at,survey_started_at,Fluency
0,Turkish,5868340900,a,c,b,,,"two grammatical errors have been corrected ""ıs...",straightforward,familiar,...,initial understanding: finding key details,initial understanding: finding the main idea,synthesis and decision-making,0,,Pilot 3A,over_15_years,2020-12-23 22:01:18,2020-12-23 21:58:15,Fluent
1,Turkish,5868379114,a,c,b,,,"two grammatical errors have been corrected ""ıs...",straightforward,familiar,...,initial understanding: finding key details,initial understanding: finding the main idea,synthesis and decision-making,0,,Pilot 3A,over_15_years,2020-12-23 21:52:09,2020-12-23 21:47:05,Fluent
2,Turkish,5868379132,a,c,b,,,"two grammatical errors have been corrected ""ıs...",straightforward,familiar,...,initial understanding: finding key details,initial understanding: finding the main idea,synthesis and decision-making,0,,Pilot 3A,over_15_years,2020-12-23 22:11:33,2020-12-23 22:07:06,Fluent
3,Turkish,5868424995,a,c,b,,,"two grammatical errors have been corrected ""ıs...",straightforward,familiar,...,initial understanding: finding key details,initial understanding: finding the main idea,synthesis and decision-making,0,,GT,,NaT,NaT,GT
4,Turkish,5868942308,a,c,b,,,"two grammatical errors have been corrected ""ıs...",straightforward,familiar,...,initial understanding: finding key details,initial understanding: finding the main idea,synthesis and decision-making,0,,Pilot 3A,over_15_years,2020-12-24 07:35:27,2020-12-24 07:26:44,Fluent
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,Turkish,5871995545,b,c,b,,,"punctuation errors: ""...inşa etti, çünkü..."" -...",complex,unfamiliar,...,initial understanding: finding key details,initial understanding: finding the main idea,synthesis and decision-making,0,,Pilot 3A,over_15_years,2020-12-27 21:24:30,2020-12-27 21:23:00,Fluent
204,Turkish,5872385930,b,b,a,,,"punctuation errors: ""...inşa etti, çünkü..."" -...",complex,unfamiliar,...,initial understanding: finding key details,initial understanding: finding the main idea,synthesis and decision-making,0,,Pilot 3A,over_15_years,2020-12-28 07:35:45,2020-12-28 07:33:19,Fluent
205,Turkish,5872888505,b,b,a,,,"punctuation errors: ""...inşa etti, çünkü..."" -...",complex,unfamiliar,...,initial understanding: finding key details,initial understanding: finding the main idea,synthesis and decision-making,0,,Pilot 3A,over_15_years,2020-12-28 10:17:44,2020-12-28 10:12:23,Fluent
206,Turkish,5872947603,b,b,b,,,"punctuation errors: ""...inşa etti, çünkü..."" -...",complex,unfamiliar,...,initial understanding: finding key details,initial understanding: finding the main idea,synthesis and decision-making,0,,GT,,NaT,NaT,GT


In [33]:
v1

Unnamed: 0,Language,_id,rater_answer,a_domain,a_register,b_domain,b_register,difficulty,familiarity,question_,...,_region,_city,_ip,Answer,Score,Grouping,31_language_1,survey_created_at,survey_started_at,Fluency
0,Turkish,5868295106,yes,season,neutral,season,neutral,easy,familiar,1,...,34.0,Istanbul,159.146.43.95,yes,1,Pilot 3A,over_15_years,2020-12-23 22:01:18,2020-12-23 21:58:15,Fluent
1,Turkish,5868300856,yes,season,neutral,season,neutral,easy,familiar,1,...,34.0,Istanbul,107.150.95.12,yes,1,Pilot 3A,over_15_years,2020-12-23 22:11:33,2020-12-23 22:07:06,Fluent
2,Turkish,5868300866,yes,season,neutral,season,neutral,easy,familiar,1,...,34.0,Istanbul,107.150.95.12,yes,1,Pilot 3A,over_15_years,2020-12-23 21:52:09,2020-12-23 21:47:05,Fluent
3,Turkish,5868359497,yes,season,neutral,season,neutral,easy,familiar,1,...,20.0,Imus,124.106.180.38,yes,1,GT,,NaT,NaT,GT
4,Turkish,5868406548,yes,season,neutral,season,neutral,easy,familiar,1,...,2.0,Kellyville,101.179.219.27,yes,1,GT,,NaT,NaT,GT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1113,Turkish,5871973241,no,idioms,neutral,idioms,neutral,hard,unfamiliar,43,...,54.0,Sakarya,88.230.169.32,no,1,Pilot 3A,over_15_years,2020-12-27 21:24:30,2020-12-27 21:23:00,Fluent
1114,Turkish,5872312585,no,idioms,neutral,idioms,neutral,hard,unfamiliar,43,...,68.0,Ankara,46.155.64.92,no,1,Pilot 3A,over_15_years,2020-12-28 07:35:45,2020-12-28 07:33:19,Fluent
1115,Turkish,5872622491,no,idioms,neutral,idioms,neutral,hard,unfamiliar,43,...,35.0,Izmir,176.88.68.107,no,1,Pilot 3A,over_15_years,2020-12-28 10:17:44,2020-12-28 10:12:23,Fluent
1116,Turkish,5872921698,no,idioms,neutral,idioms,neutral,hard,unfamiliar,43,...,20.0,Imus,124.106.180.38,no,1,GT,,NaT,NaT,GT


In [34]:
v2

Unnamed: 0,Language,_id,rater_answer,a_domain,a_register,b_domain,b_register,difficulty,familiarity,question_,...,_ip,Answer,Alternate Answer,Score,Answers,Grouping,31_language_1,survey_created_at,survey_started_at,Fluency
0,Turkish,5868301063,a_and_b_have_the_same_meaning,time,formal,time,neutral,easy,familiar,1,...,159.146.43.95,a_and_b_have_the_same_meaning,0,1,a_and_b_have_the_same_meaning;0,Pilot 3A,over_15_years,2020-12-23 22:01:18,2020-12-23 21:58:15,Fluent
1,Turkish,5868344334,a_and_b_have_the_same_meaning,time,formal,time,neutral,easy,familiar,1,...,107.150.95.100,a_and_b_have_the_same_meaning,0,1,a_and_b_have_the_same_meaning;0,Pilot 3A,over_15_years,2020-12-23 22:11:33,2020-12-23 22:07:06,Fluent
2,Turkish,5868344361,a_and_b_have_the_same_meaning,time,formal,time,neutral,easy,familiar,1,...,91.140.83.182,a_and_b_have_the_same_meaning,0,1,a_and_b_have_the_same_meaning;0,Pilot 3A,over_15_years,2020-12-23 21:52:09,2020-12-23 21:47:05,Fluent
3,Turkish,5868410262,a_and_b_have_the_same_meaning,time,formal,time,neutral,easy,familiar,1,...,101.179.219.27,a_and_b_have_the_same_meaning,0,1,a_and_b_have_the_same_meaning;0,GT,,NaT,NaT,GT
4,Turkish,5868780952,a_and_b_have_the_same_meaning,time,formal,time,neutral,easy,familiar,1,...,31.141.60.16,a_and_b_have_the_same_meaning,0,1,a_and_b_have_the_same_meaning;0,Pilot 3A,over_15_years,2020-12-24 07:35:27,2020-12-24 07:26:44,Fluent
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2595,Turkish,5871990756,a_and_b_are_not_related,communication,slang/informal,physical activity,neutral,easy,familiar,100,...,88.230.169.32,a_and_b_are_not_related,0,1,a_and_b_are_not_related;0,Pilot 3A,over_15_years,2020-12-27 21:24:30,2020-12-27 21:23:00,Fluent
2596,Turkish,5872365862,a_and_b_are_not_related,communication,slang/informal,physical activity,neutral,easy,familiar,100,...,46.155.64.92,a_and_b_are_not_related,0,1,a_and_b_are_not_related;0,Pilot 3A,over_15_years,2020-12-28 07:35:45,2020-12-28 07:33:19,Fluent
2597,Turkish,5872835237,a_and_b_are_not_related,communication,slang/informal,physical activity,neutral,easy,familiar,100,...,176.88.68.107,a_and_b_are_not_related,0,1,a_and_b_are_not_related;0,Pilot 3A,over_15_years,2020-12-28 10:17:44,2020-12-28 10:12:23,Fluent
2598,Turkish,5872944308,a_and_b_have_the_same_meaning,communication,slang/informal,physical activity,neutral,easy,familiar,100,...,124.106.180.38,a_and_b_are_not_related,0,0,a_and_b_are_not_related;0,GT,,NaT,NaT,GT


In [35]:
pilot_var_selected

'Pilot 3A'

In [36]:
rcR

Unnamed: 0,Language,_id,question_no_1,question_no_2,question_no_3,question_no_4,question_no_5,changes,complexity,familiarity,...,Question 3 Skill tested,Question 4 Skill tested,Question 5 Skill tested,Grouping,31_language_1,survey_created_at,survey_started_at,Fluency,Time_Taken_Seconds,Time_Taken_Minutes_Overall
0,Turkish,5868340900,a,c,b,,,"two grammatical errors have been corrected ""ıs...",straightforward,familiar,...,synthesis and decision-making,0,,Pilot 3A,over_15_years,2020-12-23 22:01:18,2020-12-23 21:58:15,Fluent,160,26.983333
1,Turkish,5868379114,a,c,b,,,"two grammatical errors have been corrected ""ıs...",straightforward,familiar,...,synthesis and decision-making,0,,Pilot 3A,over_15_years,2020-12-23 21:52:09,2020-12-23 21:47:05,Fluent,291,47.366667
2,Turkish,5868379132,a,c,b,,,"two grammatical errors have been corrected ""ıs...",straightforward,familiar,...,synthesis and decision-making,0,,Pilot 3A,over_15_years,2020-12-23 22:11:33,2020-12-23 22:07:06,Fluent,291,46.983333
3,Turkish,5868424995,a,c,b,,,"two grammatical errors have been corrected ""ıs...",straightforward,familiar,...,synthesis and decision-making,0,,GT,,NaT,NaT,GT,122,20.533333
4,Turkish,5868942308,a,c,b,,,"two grammatical errors have been corrected ""ıs...",straightforward,familiar,...,synthesis and decision-making,0,,Pilot 3A,over_15_years,2020-12-24 07:35:27,2020-12-24 07:26:44,Fluent,165,32.400000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,Turkish,5871995545,b,c,b,,,"punctuation errors: ""...inşa etti, çünkü..."" -...",complex,unfamiliar,...,synthesis and decision-making,0,,Pilot 3A,over_15_years,2020-12-27 21:24:30,2020-12-27 21:23:00,Fluent,380,27.666667
204,Turkish,5872385930,b,b,a,,,"punctuation errors: ""...inşa etti, çünkü..."" -...",complex,unfamiliar,...,synthesis and decision-making,0,,Pilot 3A,over_15_years,2020-12-28 07:35:45,2020-12-28 07:33:19,Fluent,413,46.766667
205,Turkish,5872888505,b,b,a,,,"punctuation errors: ""...inşa etti, çünkü..."" -...",complex,unfamiliar,...,synthesis and decision-making,0,,Pilot 3A,over_15_years,2020-12-28 10:17:44,2020-12-28 10:12:23,Fluent,678,51.766667
206,Turkish,5872947603,b,b,b,,,"punctuation errors: ""...inşa etti, çünkü..."" -...",complex,unfamiliar,...,synthesis and decision-making,0,,GT,,NaT,NaT,GT,177,24.716667


In [37]:
v1R

Unnamed: 0,Language,_id,rater_answer,a_domain,a_register,b_domain,b_register,difficulty,familiarity,question_,...,_ip,Answer,Score,Grouping,31_language_1,survey_created_at,survey_started_at,Fluency,Time_Taken_Seconds,Time_Taken_Minutes_Overall
0,Turkish,5868295106,yes,season,neutral,season,neutral,easy,familiar,1,...,159.146.43.95,yes,1,Pilot 3A,over_15_years,2020-12-23 22:01:18,2020-12-23 21:58:15,Fluent,20,8.233333
1,Turkish,5868300856,yes,season,neutral,season,neutral,easy,familiar,1,...,107.150.95.12,yes,1,Pilot 3A,over_15_years,2020-12-23 22:11:33,2020-12-23 22:07:06,Fluent,36,22.250000
2,Turkish,5868300866,yes,season,neutral,season,neutral,easy,familiar,1,...,107.150.95.12,yes,1,Pilot 3A,over_15_years,2020-12-23 21:52:09,2020-12-23 21:47:05,Fluent,21,23.566667
3,Turkish,5868359497,yes,season,neutral,season,neutral,easy,familiar,1,...,124.106.180.38,yes,1,GT,,NaT,NaT,GT,37,7.716667
4,Turkish,5868406548,yes,season,neutral,season,neutral,easy,familiar,1,...,101.179.219.27,yes,1,GT,,NaT,NaT,GT,34,8.183333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1113,Turkish,5871973241,no,idioms,neutral,idioms,neutral,hard,unfamiliar,43,...,88.230.169.32,no,1,Pilot 3A,over_15_years,2020-12-27 21:24:30,2020-12-27 21:23:00,Fluent,4,8.766667
1114,Turkish,5872312585,no,idioms,neutral,idioms,neutral,hard,unfamiliar,43,...,46.155.64.92,no,1,Pilot 3A,over_15_years,2020-12-28 07:35:45,2020-12-28 07:33:19,Fluent,10,10.350000
1115,Turkish,5872622491,no,idioms,neutral,idioms,neutral,hard,unfamiliar,43,...,176.88.68.107,no,1,Pilot 3A,over_15_years,2020-12-28 10:17:44,2020-12-28 10:12:23,Fluent,9,11.283333
1116,Turkish,5872921698,no,idioms,neutral,idioms,neutral,hard,unfamiliar,43,...,124.106.180.38,no,1,GT,,NaT,NaT,GT,9,7.716667


In [38]:
v2R

Unnamed: 0,Language,_id,rater_answer,a_domain,a_register,b_domain,b_register,difficulty,familiarity,question_,...,Alternate Answer,Score,Answers,Grouping,31_language_1,survey_created_at,survey_started_at,Fluency,Time_Taken_Seconds,Time_Taken_Minutes_Overall
0,Turkish,5868301063,a_and_b_have_the_same_meaning,time,formal,time,neutral,easy,familiar,1,...,0,1,a_and_b_have_the_same_meaning;0,Pilot 3A,over_15_years,2020-12-23 22:01:18,2020-12-23 21:58:15,Fluent,46,30.383333
1,Turkish,5868344334,a_and_b_have_the_same_meaning,time,formal,time,neutral,easy,familiar,1,...,0,1,a_and_b_have_the_same_meaning;0,Pilot 3A,over_15_years,2020-12-23 22:11:33,2020-12-23 22:07:06,Fluent,4,29.250000
2,Turkish,5868344361,a_and_b_have_the_same_meaning,time,formal,time,neutral,easy,familiar,1,...,0,1,a_and_b_have_the_same_meaning;0,Pilot 3A,over_15_years,2020-12-23 21:52:09,2020-12-23 21:47:05,Fluent,10,31.450000
3,Turkish,5868410262,a_and_b_have_the_same_meaning,time,formal,time,neutral,easy,familiar,1,...,0,1,a_and_b_have_the_same_meaning;0,GT,,NaT,NaT,GT,19,23.016667
4,Turkish,5868780952,a_and_b_have_the_same_meaning,time,formal,time,neutral,easy,familiar,1,...,0,1,a_and_b_have_the_same_meaning;0,Pilot 3A,over_15_years,2020-12-24 07:35:27,2020-12-24 07:26:44,Fluent,11,23.516667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2595,Turkish,5871990756,a_and_b_are_not_related,communication,slang/informal,physical activity,neutral,easy,familiar,100,...,0,1,a_and_b_are_not_related;0,Pilot 3A,over_15_years,2020-12-27 21:24:30,2020-12-27 21:23:00,Fluent,9,27.883333
2596,Turkish,5872365862,a_and_b_are_not_related,communication,slang/informal,physical activity,neutral,easy,familiar,100,...,0,1,a_and_b_are_not_related;0,Pilot 3A,over_15_years,2020-12-28 07:35:45,2020-12-28 07:33:19,Fluent,51,32.600000
2597,Turkish,5872835237,a_and_b_are_not_related,communication,slang/informal,physical activity,neutral,easy,familiar,100,...,0,1,a_and_b_are_not_related;0,Pilot 3A,over_15_years,2020-12-28 10:17:44,2020-12-28 10:12:23,Fluent,28,37.183333
2598,Turkish,5872944308,a_and_b_have_the_same_meaning,communication,slang/informal,physical activity,neutral,easy,familiar,100,...,0,0,a_and_b_are_not_related;0,GT,,NaT,NaT,GT,8,22.150000


In [39]:
list_of_datasets

{'Near Exact Match':    Language Fluency    _unit_id  question_                     a_domain  \
 0   Turkish  Fluent  2914019180          6                architectural   
 1   Turkish  Fluent  2914019177          3                architectural   
 2   Turkish  Fluent  2914019183          9                       nature   
 3   Turkish  Fluent  2914019185         11            Literature-cinema   
 4   Turkish  Fluent  2914019200         26                   night life   
 5   Turkish  Fluent  2914019198         24                         verb   
 6   Turkish  Fluent  2914019179          5                       idioms   
 7   Turkish  Fluent  2914019196         22                      politic   
 8   Turkish  Fluent  2914019209         35                  house items   
 9   Turkish  Fluent  2914019184         10                       idioms   
 10  Turkish  Fluent  2914019207         33                general words   
 11  Turkish  Fluent  2914019215         41                       id

In [40]:
list_of_summaries

{'Pilot 3_rc':     Language         _id question_no_1 question_no_2 question_no_3  \
 0    Turkish  5868340900             a             c             b   
 1    Turkish  5868379114             a             c             b   
 2    Turkish  5868379132             a             c             b   
 3    Turkish  5868424995             a             c             b   
 4    Turkish  5868942308             a             c             b   
 ..       ...         ...           ...           ...           ...   
 203  Turkish  5871995545             b             c             b   
 204  Turkish  5872385930             b             b             a   
 205  Turkish  5872888505             b             b             a   
 206  Turkish  5872947603             b             b             b   
 207  Turkish  5873505835             b             b             a   
 
      question_no_4  question_no_5  \
 0              NaN            NaN   
 1              NaN            NaN   
 2              Na