In [None]:
%reset

###################################################################       
#Script Name    :                                                                                              
#Description    :                                                                                 
#Args           :                                                                                           
#Author         : Nikhil Rao in R, converted to Python by Nor Raymond                                              
#Email          : nraymond@appen.com                                          
###################################################################

In [4]:
import os
import glob 
import pandas as pd
import numpy as np
import yaml
import warnings
warnings.filterwarnings("ignore")

# Function to load yaml configuration file
def load_config(config_name):
    """Parse a YAML file located in the ``config_path`` directory.

    ``config_path`` is a module-level name that is looked up when the function
    is called, so it must be defined before the first call. The joined path is
    opened as given; if ``config_path`` is relative it is resolved against the
    current working directory.

    Parameters
    ----------
    config_name : str
        File name of the YAML document inside ``config_path``.

    Returns
    -------
    The object produced by ``yaml.safe_load`` for that file.
    """
    yaml_file = os.path.join(config_path, config_name)
    with open(yaml_file, 'r') as handle:
        return yaml.safe_load(handle)

# Directory holding the YAML configuration files. It is relative, so
# load_config() resolves it against the current working directory.
config_path = "conf/base"


def _enter_project_root():
    """Load catalog.yml, chdir into its ``project_path`` and return ``(config, cwd)``."""
    catalog = load_config("catalog.yml")
    os.chdir(catalog["project_path"])
    return catalog, os.getcwd()


try:
    config, root_path = _enter_project_root()
except Exception:
    # Fallback: step one directory up and retry once, so the relative
    # config_path is looked up from the parent directory instead.
    # `except Exception` (rather than a bare `except:`) keeps every ordinary
    # failure on this retry path but no longer swallows KeyboardInterrupt or
    # SystemExit. A failure of the retry itself propagates to the caller.
    os.chdir('..')
    config, root_path = _enter_project_root()
    
# import data_processing module
import src.data.data_processing as data_processing

In [5]:
# Load the datasets produced by the data_processing module.
# main() returns eight objects, unpacked here in the order shown.
raters, r1, r2, r3, languages, rc, v1, v2 = data_processing.main()
# Display the languages table; language_selection() validates the user's
# number against this table's index.
languages

Unnamed: 0,Language
0,Russian
1,Hebrew
2,Indonesian
3,Chinese(Simplified)


In [7]:
def language_selection(languages):
    """Prompt until the user enters a valid row number, then return that language.

    Parameters
    ----------
    languages : pandas.DataFrame
        Table of selectable languages. The returned value is read from the
        first column; accepted numbers run from the smallest to the largest
        index value.

    Returns
    -------
    The first-column value of the selected row.

    Raises
    ------
    ValueError
        If ``languages`` has no rows. Previously this case looped forever,
        because ``min()`` on an empty index raised a ValueError that was
        reported as a non-numeric entry.
    """
    if len(languages) == 0:
        raise ValueError("languages is empty; there is nothing to select")

    # The bounds do not change between attempts, so compute them once.
    lowest, highest = min(languages.index), max(languages.index)

    while True:
        answer = input("\nPlease select the number of the Language you are assessing: ").strip()

        # Blank input has to be detected on the raw text: once converted with
        # int() it can never compare equal to "", so the old check was dead.
        if answer == "":
            print("\nYou must enter any numbers")
            continue

        try:
            language_index = int(answer)
        except ValueError:
            print("\nYou must enter numerical values only... Please try again")
            continue

        if language_index < lowest or language_index > highest:
            print(f"\nYou must enter numbers between {lowest} - {highest}... Please try again")
            continue

        # NOTE(review): the range check uses index labels while the lookup is
        # positional (.iloc). The two agree only for a default 0..n-1 index;
        # confirm that is what data_processing returns.
        language_selected = languages.iloc[language_index, 0]
        print(f"\nYou have selected {language_index} for {language_selected}")
        return language_selected
    
# Interactive step: waits for keyboard input until a valid row number is entered.
language_selected = language_selection(languages)


Please select the number of the Language you are assessing:  1



You have selected 1 for Hebrew


#### Functions for Language Modification - getting the overall time taken

In [60]:
# function for Language Modification
def get_time_taken(df, language_selected):
    """Filter ``df`` to one language and add per-item and per-worker timings.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Language``, ``_worker_id``, ``_created_at``
        and ``_started_at``; the two timestamp columns must subtract to a
        timedelta.
    language_selected
        Value compared for equality against the ``Language`` column.

    Returns
    -------
    pandas.DataFrame
        The matching rows (with a fresh 0..n-1 index from the merge) plus:
        ``Time_Taken_Seconds``, the whole seconds between ``_started_at`` and
        ``_created_at`` for each row, and ``Time_Taken_Minutes_Overall``, the
        sum of ``Time_Taken_Seconds`` over all of the worker's rows, divided
        by 60.
    """
    # Work on an explicit copy so the new column is not assigned onto a
    # filtered slice of the caller's frame (SettingWithCopyWarning).
    dfr = df[df['Language'] == language_selected].copy()

    # Time Taken by Item.
    # Floor-dividing by one second yields the full duration in whole seconds.
    # `.dt.seconds` is only the seconds component and silently drops whole days.
    elapsed = dfr['_created_at'] - dfr['_started_at']
    dfr["Time_Taken_Seconds"] = elapsed // pd.Timedelta(seconds=1)

    # Time Taken Overall.
    # Select the column before summing: the first positional argument of
    # groupby(...).sum() is `numeric_only`, not a column name.
    per_worker = (
        dfr.groupby('_worker_id')["Time_Taken_Seconds"]
        .sum()
        .div(60)
        .rename("Time_Taken_Minutes_Overall")
        .reset_index()
    )

    return pd.merge(dfr, per_worker, how='left', on='_worker_id')

def get_time_taken_all(language_selected, rc, v1, v2):
    """Run ``get_time_taken`` on ``rc``, ``v1`` and ``v2`` for the same language.

    Parameters
    ----------
    language_selected
        Passed unchanged to every ``get_time_taken`` call.
    rc, v1, v2 : pandas.DataFrame
        The three frames to process, handled in this order.

    Returns
    -------
    tuple
        ``(rcR, v1R, v2R)``: the ``get_time_taken`` result for each input
        frame, in the same order as the arguments.
    """
    rcR, v1R, v2R = (
        get_time_taken(frame, language_selected) for frame in (rc, v1, v2)
    )
    return rcR, v1R, v2R

# Language-filtered copies of rc, v1 and v2 with the timing columns added.
rcR, v1R, v2R = get_time_taken_all(language_selected, rc, v1, v2)

#### Functions for calculating Fail Rates

In [None]:
# # V1 Fail Rates -----------------------------------------------------------
# v1_actual_correct_by_question <- v1R %>%
#   ## Remove Tenure from Grouping
#   dplyr::select(Language, Market, `_worker_id`, `_unit_id`, question_, a_domain, a_register, wordphrase_a, b_domain, b_register, wordphrase_b, 
#                 difficulty, Answer, Score) %>%
#   group_by(Language, Market, `_unit_id`, question_, a_domain, a_register, wordphrase_a, b_domain, b_register, wordphrase_b, 
#            difficulty, Answer, Score) %>%
#   summarise(Count_of_Test_Takers = n()) %>% 
#   ungroup() %>%
#   group_by(Language, Market, `_unit_id`, question_, a_domain, a_register, wordphrase_a, b_domain, b_register, wordphrase_b, difficulty) %>% 
#   mutate(Total_Test_Takers = sum(Count_of_Test_Takers),
#          Fail_Rate = round((Count_of_Test_Takers / Total_Test_Takers), 2)) %>% 
#   ungroup() %>% 
#   group_by(Market, `_unit_id`, question_, a_domain, a_register, wordphrase_a, b_domain, b_register, wordphrase_b, difficulty) %>%
#   filter(Score %in% "0") %>%
#   ungroup() %>% 
#   arrange(Market, -Fail_Rate)

In [71]:
def v1_actual_correct_by_question(v1R):
    """Count test takers per question/answer/score and per question.

    Parameters
    ----------
    v1R : pandas.DataFrame
        Must contain ``_worker_id`` plus every grouping column listed below.

    Returns
    -------
    pandas.DataFrame
        One row per (question, ``Answer``, ``Score``) combination with:
        ``Count_of_Test_Takers``, the number of non-null ``_worker_id`` values
        in that combination, and ``Total_Test_Takers``, the sum of
        ``Count_of_Test_Takers`` over all rows sharing the same question
        columns.

    Notes
    -----
    The later steps of the commented R reference (``Fail_Rate``, the
    ``Score`` filter and the final ordering) are not performed here.
    """
    # Columns identifying one question; adding Answer and Score gives the finer grouping.
    question_keys = ['Language', 'Market', '_unit_id', 'question_', 'a_domain', 'a_register',
                     'wordphrase_a', 'b_domain', 'b_register', 'wordphrase_b', 'difficulty']
    answer_keys = question_keys + ['Answer', 'Score']

    counts = (
        v1R[answer_keys + ['_worker_id']]
        .groupby(answer_keys)['_worker_id']
        .count()
        .reset_index()
        .rename(columns={"_worker_id": "Count_of_Test_Takers"})
    )

    # transform('sum') broadcasts the per-question total back onto every row.
    per_question = counts.groupby(question_keys)["Count_of_Test_Takers"]
    counts["Total_Test_Takers"] = per_question.transform('sum')

    return counts

# Per-question answer/score counts for the selected language's v1 data.
v1R_grouped = v1_actual_correct_by_question(v1R)
v1R_grouped


Unnamed: 0,Language,Market,_unit_id,question_,a_domain,a_register,wordphrase_a,b_domain,b_register,wordphrase_b,difficulty,Answer,Score,Count_of_Test_Takers,Total_Test_Takers
0,Hebrew,IW-IW,2920269741,3,Mood,formal,גיל,Mood,neutral,שמחה,easy,yes,0,35,73
1,Hebrew,IW-IW,2920269741,3,Mood,formal,גיל,Mood,neutral,שמחה,easy,yes,1,38,73
2,Hebrew,IW-IW,2920269742,4,Shopping,slang/informal,סופר,Shopping,formal,מרכול,easy,yes,0,7,73
3,Hebrew,IW-IW,2920269742,4,Shopping,slang/informal,סופר,Shopping,formal,מרכול,easy,yes,1,66,73
4,Hebrew,IW-IW,2920269743,5,Finance,slang/informal,עודף,Finance,formal,כסף קטן,hard,yes,0,19,73
5,Hebrew,IW-IW,2920269743,5,Finance,slang/informal,עודף,Finance,formal,כסף קטן,hard,yes,1,54,73
6,Hebrew,IW-IW,2920269744,6,Food,neutral,אבטיח,Food,neutral,מלון,easy,no,0,2,73
7,Hebrew,IW-IW,2920269744,6,Food,neutral,אבטיח,Food,neutral,מלון,easy,no,1,71,73
8,Hebrew,IW-IW,2920269745,9,Restaurant,formal,רב-מלצרים,Restaurant,neutral,מלצרית,hard,no,0,15,73
9,Hebrew,IW-IW,2920269745,9,Restaurant,formal,רב-מלצרים,Restaurant,neutral,מלצרית,hard,no,1,58,73


#### Run all 

In [None]:
# def main():

#     file_initials = ['RC', 'Vocab_1', 'Vocab_2']

#     df_summary = obtain_file_summary_df(file_initials)
#     df_data = obtain_file_data_df(file_initials)
#     raters =  obtain_distinct_raters(df_summary)
#     rc, v1, v2 = merge_raters_to_df_data(df_data, raters)
    
#     return rc, v1, v2

# if __name__ == "__main__":

#     rc, v1, v2 = main()
#     print('Automated data processing completed.')

# OTHER

In [None]:
def v1_actual_correct_by_question(v1R):
    """Count test takers per question/answer/score and per question.

    A function with this name is also defined earlier in the notebook. When
    the cells run top to bottom this later definition is the one in effect,
    so it produces the same columns as the earlier one, including
    ``Total_Test_Takers``, rather than silently dropping that column.

    Parameters
    ----------
    v1R : pandas.DataFrame
        Must contain ``_worker_id`` plus every grouping column listed below.

    Returns
    -------
    pandas.DataFrame
        One row per (question, ``Answer``, ``Score``) combination with
        ``Count_of_Test_Takers`` (non-null ``_worker_id`` values in that
        combination) and ``Total_Test_Takers`` (sum of
        ``Count_of_Test_Takers`` across the rows of the same question).
    """
    per_question_cols = ['Language', 'Market', '_unit_id', 'question_', 'a_domain', 'a_register',
                         'wordphrase_a', 'b_domain', 'b_register', 'wordphrase_b', 'difficulty']
    per_answer_cols = per_question_cols + ['Answer', 'Score']

    v1R_temp = v1R[per_answer_cols + ['_worker_id']]

    v1R_grouped = v1R_temp.groupby(per_answer_cols)['_worker_id'].count().reset_index()
    v1R_grouped = v1R_grouped.rename(columns={"_worker_id": "Count_of_Test_Takers"})

    # Broadcast each question's total back onto its answer/score rows.
    v1R_grouped["Total_Test_Takers"] = (
        v1R_grouped.groupby(per_question_cols)["Count_of_Test_Takers"].transform('sum')
    )

    return v1R_grouped

# NOTE(review): this rebinds v1R_grouped, replacing the value assigned by the
# earlier cell that makes the same call.
v1R_grouped = v1_actual_correct_by_question(v1R)
v1R_grouped