This notebook is used to merge the old annotations with the updated dataframes.
Once this is done, we can extract the non-annotated data, put it into a new dataframe, relabel it, and re-merge it in this notebook too.

In [1]:
import os
import yaml
import pickle

import numpy as np
import pandas as pd
# Never truncate the column list when a dataframe is displayed.
pd.options.display.max_columns = None

# Post Test Data

In [2]:
# Post test: load the four pickled objects used throughout this notebook.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
def _unpickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


questions = _unpickle('../../data/post_test/questions.pkl')
rankings = _unpickle('../../data/post_test/rankings.pkl')
some_questions = _unpickle('../../data/post_test/some_questions.pkl')
some = _unpickle('../../data/post_test/some.pkl')

# First iteration of labelling
Here, we merge the first-iteration annotation data for the first three versions.

## Question merging

In [3]:
# Directory holding the tab-separated annotation files read below. The trailing
# slash matters: file paths are built by plain string concatenation.
annotation_root = '../../data/post_test/annotations/'

### question 7
**Width is halved** \
Answer: 0.37 

In [22]:
# First-iteration annotations for question 7. The header found in the file is
# replaced by the column names used in the rest of this notebook.
seven_columns = [
    'consent', 'date', 'field', 'gender', 'language', 'level', 'username', 'year',
    'seven', 'label_seven',
]
seven = pd.read_csv(annotation_root + 'seven.csv', sep='\t', index_col=0)
seven.columns = seven_columns

In [5]:
# Attach the question 7 annotations; rows without an annotation keep NaN.
# NOTE(review): unlike the merges for questions 8-10, 'date' is not a key here.
# Presumably some_questions has no 'date' column until this merge brings it in
# from `seven` -- confirm before adding it to the key list.
seven_keys = ['consent', 'field', 'gender', 'language', 'level', 'username', 'year']
some_questions = pd.merge(some_questions, seven, how='left', on=seven_keys)

### question 8
**Concentration is tripled**\
answer: 1.59

In [5]:
# First-iteration annotations for question 8. The header found in the file is
# replaced by the column names used in the rest of this notebook.
eight_columns = [
    'consent', 'date', 'field', 'gender', 'language', 'level', 'username', 'year',
    'eight', 'label_eight',
]
eight = pd.read_csv(annotation_root + 'eight.csv', sep='\t', index_col=0)
eight.columns = eight_columns

In [7]:
# Attach the question 8 annotations; rows without an annotation keep NaN.
eight_keys = ['date', 'consent', 'field', 'gender', 'language', 'level', 'username', 'year']
some_questions = pd.merge(some_questions, eight, how='left', on=eight_keys)

### question 9
**Width is halved, Concentration is doubled** \
answer: 0.96

In [7]:
# First-iteration annotations for question 9. The header found in the file is
# replaced by the column names used in the rest of this notebook.
nine_columns = [
    'consent', 'date', 'field', 'gender', 'language', 'level', 'username', 'year',
    'nine', 'label_nine',
]
nine = pd.read_csv(annotation_root + 'nine.csv', sep='\t', index_col=0)
nine.columns = nine_columns

In [9]:
# Attach the question 9 annotations; rows without an annotation keep NaN.
nine_keys = ['date', 'consent', 'field', 'gender', 'language', 'level', 'username', 'year']
some_questions = pd.merge(some_questions, nine, how='left', on=nine_keys)

### Question 10
**Width is quadrupled, concentration is divided by three** \
answer: 0.8


In [8]:
# First-iteration annotations for question 10. The header found in the file is
# replaced by the column names used in the rest of this notebook.
ten_columns = [
    'consent', 'date', 'field', 'gender', 'language', 'level', 'username', 'year',
    'ten', 'label_ten',
]
ten = pd.read_csv(annotation_root + 'ten.csv', sep='\t', index_col=0)
ten.columns = ten_columns

In [11]:
# Attach the question 10 annotations; rows without an annotation keep NaN.
ten_keys = ['date', 'consent', 'field', 'gender', 'language', 'level', 'username', 'year']
some_questions = pd.merge(some_questions, ten, how='left', on=ten_keys)

## Extract non labelled entries

In [12]:
# Mark rows that received no annotation from the left merges.
# NOTE(review): this replaces NaN in *every* column (not only the label_*
# columns) with the string 'non-labelled'.
some_questions = some_questions.fillna(value='non-labelled')

In [13]:
# Keep the rows still to be annotated. Only the question 7 label is inspected.
is_unlabelled = some_questions['label_seven'].eq('non-labelled')
non_labelled = some_questions[is_unlabelled]

In [14]:
# Export the rows still to be annotated as a tab-separated file (index included).
non_labelled.to_csv(path_or_buf='../../data/post_test/non_labelled.csv', sep='\t')

In [15]:
# Keep a pickled copy of the same rows next to the CSV export.
with open('../../data/post_test/non_labelled.pkl', 'wb') as handle:
    pickle.Pickler(handle).dump(non_labelled)

In [16]:
# Preview the first rows of the export.
non_labelled.head(n=5)

Unnamed: 0,username,start_time,exploration_time,ranking_task_time,ranking,ranking_confidence,ranking_time,q1,q1_conf,q1_time,q2,q2_conf,q2_time,q3,q3_conf,q3_time,q4,q4_conf,q4_time,q5_colour0,q5_colour1,q5_colour2,q5_colour3,q5_time,q6_colour0,q6_colour1,q6_colour2,q6_colour3,q6_time,q7_colour0,q7_colour1,q7_colour2,q7_colour3,q7_time,q8_colour0,q8_colour1,q8_colour2,q8_colour3,q8_time,formula,fomula_conf,formula_time,notes_text,notes_math,notes_table,notes_diag,notes_time,beerslaw,beerslaw_conf,beerslaw_time,problem_ranking,problem_conf,problem_time,feedback_entertain,feedback_difficult,feedback_good,feedback_bad,feedback,feedback_time,experiment_time,status,session,version,numerical_status,language,field,level,year,consent,gender,date,seven,label_seven,eight,label_eight,nine,label_nine,ten,label_ten
213,4jchqv4d,{'time': 1634035392},{'time': 1634035547},{'time': 1634036101},1302,45,1634036126,0.37,0,1634036317,1.59,50,1634036380,0.48,5,1634036583,0.8,90,1634036946,5,50,15,30,1634037030,5,30,15,50,1634037067,10,20,30,40,1634037084,20,20,45,15,1634037105,"Selon la lois de Lambert Beer, \r\nA= E * c * ...",100,1634037276,,,,,1634037281,oui bien sur,,1634037293,"[Co(No3)2: Cobalt (II) nitrate, CoCl2: Cobalt ...",wrong field,1634037720,50,50,les images,j'ai de la peine a faire des choses pratique c...,wrong field,1634037957,1634037973,complete,Session 40,4,18,Français,non-labelled,non-labelled,1st,1.0,2.0,non-labelled,non-labelled,non-labelled,non-labelled,non-labelled,non-labelled,non-labelled,non-labelled,non-labelled
214,u6c3cp6c,{'time': 1634032422},{'time': 1634032514},{'time': 1634032765},2130,20,1634032782,0.37,25,1634032843,1.59,30,1634032866,0.96,20,1634032886,0.60,25,1634032899,50,50,0,0,1634032943,50,50,0,0,1634032959,30,70,0,0,1634032984,70,30,0,0,1634033049,concentration volume et plein d'autre truc,0,1634033107,,,,,1634033128,oui mais plus de souvenir,,1634033147,"[Co(No3)2: Cobalt (II) nitrate, CoCl2: Cobalt ...",wrong field,1634034108,50,100,la parti labo,manque de clarté pour les exercices pas de pos...,wrong field,1634034333,1634034340,complete,Session 40,4,18,Français,non-labelled,non-labelled,1st,1.0,1.0,non-labelled,non-labelled,non-labelled,non-labelled,non-labelled,non-labelled,non-labelled,non-labelled,non-labelled
215,m4zvafhs,{'time': 1634032088},{'time': 1634032732},{'time': 1634033167},213,30,1634033197,une absorbance plus grande que celle de gauche,70,1634033263,plus grande que celle de gauche,55,1634033284,plus petite que celle de gauche\r\n,15,1634033307,plus petite que celle de gauche,5,1634033331,70,10,0,20,1634033397,100,0,0,0,1634033423,5,0,25,70,1634033451,35,60,5,0,1634033496,aucune idée,15,1634033545,,,,,1634033555,oui,,1634033562,"[Je ne sais pas, Je ne sais pas, Je ne sais pas]",wrong field,1634033583,5,55,-,-,wrong field,1634033600,1634033602,complete,Session 40,4,18,Français,non-labelled,non-labelled,1st,1.0,1.0,non-labelled,non-labelled,non-labelled,non-labelled,non-labelled,non-labelled,non-labelled,non-labelled,non-labelled
216,u25vrfyy,{'time': 1634035362},{'time': 1634035527},{'time': 1634035904},3102,100,1634035951,1.48,50,1634036008,0.18,50,1634036052,0.96,50,1634036102,0.45,50,1634036141,70,0,0,30,1634036218,20,0,0,80,1634036243,65,0,0,35,1634036281,0,100,0,0,1634036322,"Plus la concentration sera élevée, plus la sol...",75,1634036762,,,,,1634036852,oui,,1634036896,"[Co(No3)2: Cobalt (II) nitrate, CoCl2: Cobalt ...",wrong field,1634037469,50,50,Les images,Les images,wrong field,1634037718,1634037801,complete,Session 40,4,18,Français,non-labelled,non-labelled,1st,1.0,1.0,non-labelled,non-labelled,non-labelled,non-labelled,non-labelled,non-labelled,non-labelled,non-labelled,non-labelled
217,c9dj36r7,{'time': 1634035417},{'time': 1634035654},{'time': 1634036366},3012,0,1634036405,0.37,0,1634036474,0.53/3,0,1634036513,0.96,0,1634036533,0.8,0,1634036602,75,0,0,25,1634036662,0,50,50,0,1634036697,0,100,0,0,1634036710,25,25,25,25,1634036757,A=c*E*d\r\nA[-]\r\nc[mol/l]\r\nE[oui]\r\nd[dm],0,1634036868,,,,,1634036891,non,,1634036905,"[KMnO4: Potassium permanganate, KMnO4: Potassi...",wrong field,1634037340,0,100,.,.,wrong field,1634037375,1634037383,complete,Session 40,4,18,Français,non-labelled,non-labelled,1st,1.0,1.0,non-labelled,non-labelled,non-labelled,non-labelled,non-labelled,non-labelled,non-labelled,non-labelled,non-labelled


## Remerge

In [9]:
# Post test: reload the frames from disk. This rebinds `some_questions`,
# discarding the annotation columns and the 'non-labelled' fill applied above.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
with open('../../data/post_test/questions.pkl', 'rb') as questions_file, \
        open('../../data/post_test/rankings.pkl', 'rb') as rankings_file, \
        open('../../data/post_test/some_questions.pkl', 'rb') as some_questions_file:
    questions = pickle.load(questions_file)
    rankings = pickle.load(rankings_file)
    some_questions = pickle.load(some_questions_file)

In [10]:
# Columns kept alongside each answer and its label when the two annotation
# rounds are stacked below.
merging_columns = [
    'consent',
    'date',
    'field',
    'gender',
    'language',
    'level',
    'username',
    'year',
]

### question 7
**Width is halved** \
Answer: 0.37 

In [11]:
# Second-round labels for question 7, stored under the `q1` answer column.
q1 = pd.read_csv(annotation_root + 'q1.csv', sep='\t', index_col=0)
q1 = q1[merging_columns + ['q1', 'label_seven', 'flag_label_seven']]

# First-round labels in the same layout. Renaming before selecting keeps this
# cell re-runnable: on a second run 'seven' is already called 'q1' and the
# rename is a no-op instead of the selection raising a KeyError.
seven = seven.rename(columns={'seven': 'q1'})[merging_columns + ['q1', 'label_seven']]

# Stack both rounds. pd.concat replaces DataFrame.append, which was deprecated
# in pandas 1.4 and removed in 2.0. First-round rows have no flag, so it is NaN.
q1 = pd.concat([q1, seven])
q1 = q1.rename(columns={'label_seven': 'label_q1', 'flag_label_seven': 'flag_q1'})

In [12]:
# Preview the stacked question 7 labels.
q1.head(n=5)

Unnamed: 0,consent,date,field,gender,language,level,username,year,q1,label_q1,flag_q1
0,1.0,non-labelled,non-labelled,2.0,Français,non-labelled,4jchqv4d,1st,0.37,correct,
1,1.0,non-labelled,non-labelled,1.0,Français,non-labelled,u6c3cp6c,1st,0.37,correct,
2,1.0,non-labelled,non-labelled,1.0,Français,non-labelled,m4zvafhs,1st,une absorbance plus grande que celle de gauche,larger,
3,1.0,non-labelled,non-labelled,1.0,Français,non-labelled,u25vrfyy,1st,1.48,inverse,
4,1.0,non-labelled,non-labelled,1.0,Français,non-labelled,c9dj36r7,1st,0.37,correct,


### question 8
**Concentration is tripled**\
answer: 1.59

In [13]:
# Second-round labels for question 8, stored under the `q2` answer column.
q2 = pd.read_csv(annotation_root + 'q2.csv', sep='\t', index_col=0)
q2 = q2[merging_columns + ['q2', 'label_eight', 'flag_label_eight']]

# First-round labels in the same layout. Renaming before selecting keeps this
# cell re-runnable: on a second run 'eight' is already called 'q2' and the
# rename is a no-op instead of the selection raising a KeyError.
eight = eight.rename(columns={'eight': 'q2'})[merging_columns + ['q2', 'label_eight']]

# Stack both rounds. pd.concat replaces DataFrame.append, which was deprecated
# in pandas 1.4 and removed in 2.0. First-round rows have no flag, so it is NaN.
q2 = pd.concat([q2, eight])
q2 = q2.rename(columns={'label_eight': 'label_q2', 'flag_label_eight': 'flag_q2'})

In [14]:
# Preview the stacked question 8 labels.
q2.head(n=5)

Unnamed: 0,consent,date,field,gender,language,level,username,year,q2,label_q2,flag_q2
0,1.0,non-labelled,non-labelled,2.0,Français,non-labelled,4jchqv4d,1st,1.59,correct,
1,1.0,non-labelled,non-labelled,1.0,Français,non-labelled,u6c3cp6c,1st,1.59,correct,
2,1.0,non-labelled,non-labelled,1.0,Français,non-labelled,m4zvafhs,1st,plus grande que celle de gauche,larger,
3,1.0,non-labelled,non-labelled,1.0,Français,non-labelled,u25vrfyy,1st,0.18,smaller,
4,1.0,non-labelled,non-labelled,1.0,Français,non-labelled,c9dj36r7,1st,0.53/3,inverse,


### question 9
**Width is halved, Concentration is doubled** \
answer: 0.96

In [15]:
# Second-round labels for question 9, stored under the `q3` answer column.
q3 = pd.read_csv(annotation_root + 'q3.csv', sep='\t', index_col=0)
q3 = q3[merging_columns + ['q3', 'label_nine', 'flag_label_nine']]

# First-round labels in the same layout. Renaming before selecting keeps this
# cell re-runnable: on a second run 'nine' is already called 'q3' and the
# rename is a no-op instead of the selection raising a KeyError.
nine = nine.rename(columns={'nine': 'q3'})[merging_columns + ['q3', 'label_nine']]

# Stack both rounds. pd.concat replaces DataFrame.append, which was deprecated
# in pandas 1.4 and removed in 2.0. First-round rows have no flag, so it is NaN.
q3 = pd.concat([q3, nine])
q3 = q3.rename(columns={'label_nine': 'label_q3', 'flag_label_nine': 'flag_q3'})

### Question 10
**Width is quadrupled, concentration is divided by three** \
answer: 0.8

In [16]:
# Second-round labels for question 10, stored under the `q4` answer column.
q4 = pd.read_csv(annotation_root + 'q4.csv', sep='\t', index_col=0)
q4 = q4[merging_columns + ['q4', 'label_ten', 'flag_label_ten']]

# First-round labels in the same layout. Renaming before selecting keeps this
# cell re-runnable: on a second run 'ten' is already called 'q4' and the
# rename is a no-op instead of the selection raising a KeyError.
ten = ten.rename(columns={'ten': 'q4'})[merging_columns + ['q4', 'label_ten']]

# Stack both rounds. The original appended `nine` here, which dropped every
# first-round label for question 10 and added question 9 rows under unrelated
# columns. pd.concat replaces DataFrame.append (deprecated in pandas 1.4,
# removed in 2.0). First-round rows have no flag, so it is NaN.
q4 = pd.concat([q4, ten])
q4 = q4.rename(columns={'label_ten': 'label_q4', 'flag_label_ten': 'flag_q4'})

### Merge with some questions

In [28]:
# Join keys for attaching the labels to `some`. 'date' is left out here.
# NOTE(review): presumably because `some` has no 'date' column -- confirm.
merging_columns = [
    'consent',
    'field',
    'gender',
    'language',
    'level',
    'username',
    'year',
]

In [31]:
# Attach each question's labels to `some`, matching on the participant columns
# plus that question's own answer column. Unmatched rows keep NaN.
for answer_column, labels in (('q1', q1), ('q2', q2), ('q3', q3), ('q4', q4)):
    some = some.merge(labels, on=merging_columns + [answer_column], how='left')

# Second iteration of labelling 

## Merge *some* with annotations

In [4]:
# Read the annotation files for the second iteration. Each one is re-indexed on
# its 'index' column.
# NOTE(review): only q1's label/flag columns are renamed here. The q2-q4 files
# presumably already use the column names selected further down -- confirm.
q1 = (
    pd.read_csv(annotation_root + 'q1.csv', sep='\t', index_col=0)
    .set_index('index')
    .rename(columns={'label_seven': 'label_q1', 'flag_label_seven': 'flag_q1_label'})
)
q2 = pd.read_csv(annotation_root + 'q2.csv', sep='\t', index_col=0).set_index('index')
q3 = pd.read_csv(annotation_root + 'q3.csv', sep='\t', index_col=0).set_index('index')
q4 = pd.read_csv(annotation_root + 'q4.csv', sep='\t', index_col=0).set_index('index')

In [5]:
# Participant columns used as join keys for the second-iteration merges.
merging_columns = [
    'field',
    'gender',
    'language',
    'level',
    'username',
    'year',
    'consent',
]

In [6]:
# Attach each question's labels to `some`, matching on the participant columns
# plus that question's own answer column. Unmatched rows keep NaN.
# NOTE(review): if the first-iteration merge into `some` has already run in this
# kernel, `some` already has columns such as 'label_q1'; pandas then suffixes the
# clashing names with _x/_y and the column selection that follows will fail.
some = pd.merge(some, q1, how='left', on=merging_columns + ['q1'])
some = pd.merge(some, q2, how='left', on=merging_columns + ['q2'])
some = pd.merge(some, q3, how='left', on=merging_columns + ['q3'])
some = pd.merge(some, q4, how='left', on=merging_columns + ['q4'])

## Merge *some* with the other dataframes

In [7]:
# Answer, label and flag column for each of the four questions.
# NOTE(review): the label/flag names follow three different patterns
# (label_q1/flag_q1_label, q2_label/flag_q2_label, label_qN/flag_label_qN).
# Presumably they mirror the headers of the annotation files -- confirm.
some_columns = [
    'q1', 'label_q1', 'flag_q1_label',
    'q2', 'q2_label', 'flag_q2_label',
    'q3', 'label_q3', 'flag_label_q3',
    'q4', 'label_q4', 'flag_label_q4',
]

# Join keys for the other dataframes: participant columns plus the four answers.
answer_columns = ['q1', 'q2', 'q3', 'q4']
dataframe_merging_columns = merging_columns + answer_columns

# Restrict `some` to the join keys and the annotation columns.
some_reduced = some[merging_columns + some_columns]

In [8]:
# Attach the annotation columns to the three remaining post-test frames.
# Rows with no match in `some_reduced` keep NaN.
some_questions = pd.merge(some_questions, some_reduced, how='left', on=dataframe_merging_columns)
questions = pd.merge(questions, some_reduced, how='left', on=dataframe_merging_columns)
rankings = pd.merge(rankings, some_reduced, how='left', on=dataframe_merging_columns)

In [9]:
# Write each annotated frame to its own pickle file.
annotated_outputs = (
    (some, '../../data/post_test/some_annotated.pkl'),
    (some_questions, '../../data/post_test/some_questions_annotated.pkl'),
    (questions, '../../data/post_test/questions_annotated.pkl'),
    (rankings, '../../data/post_test/rankings_annotated.pkl'),
)
for frame, destination in annotated_outputs:
    with open(destination, 'wb') as fp:
        pickle.dump(frame, fp)