This notebook is used to merge the old annotations with the updated dataframes.
Once this is done, we can extract the non annotated data, put them into a new dataframe, relabel them, and remerge them in this notebook to.

In [1]:
import os
import yaml
import pickle

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

# Post Test Data

In [2]:
# Post test
with open('../../data/post_test/questions.pkl', 'rb') as fp:
    questions = pickle.load(fp)
    
with open('../../data/post_test/rankings.pkl', 'rb') as fp:
    rankings = pickle.load(fp)
    
with open('../../data/post_test/some_questions.pkl', 'rb') as fp:
    some_questions = pickle.load(fp)
    
with open('../../data/post_test/some.pkl', 'rb') as fp:
    some = pickle.load(fp)

# First iteration of labelling
Here, we will merge the iteration data for the three first versions.

## Question merging

In [3]:
annotation_root = '../../data/post_test/annotations/'

### question 7
**Width is halved** \
Answer: 0.37 

In [4]:
seven = pd.read_csv(annotation_root + 'q1.csv', sep='\t', index_col=0)
seven = seven.drop('index', axis=1)
seven = seven.rename(columns={'label_seven': 'q1_label', 'flag_label_seven': 'q1_flag'})

In [5]:
some_questions = some_questions.merge(
    seven, on=['consent', 'field', 'gender', 'language', 'level', 'username', 'year', 'q1'],
    how='left'
)

### question 8
**Concentration is tripled**\
answer: 1.59

In [6]:
eight = pd.read_csv(annotation_root + 'q2.csv', sep='\t', index_col=0)
eight = eight.drop('index', axis=1)
eight = eight.rename(columns={'label_eight':'q2_label', 'flag_label_eight': 'q2_flag'})

In [7]:
some_questions = some_questions.merge(
    eight, on=['consent', 'field', 'gender', 'language', 'level', 'username', 'year', 'q2'],
    how='left'
)

### question 9
**Width is halved, Concentration is doubled** \
answer: 0.96

In [8]:
nine = pd.read_csv(annotation_root + 'q3.csv', sep='\t', index_col=0)
nine = nine.drop('index', axis=1)
nine = nine.rename(columns={'label_nine':'q3_label', 'flag_label_q3':'q3_flag'})

In [9]:
some_questions = some_questions.merge(
    nine, on=['consent', 'field', 'gender', 'language', 'level', 'username', 'year', 'q3'],
    how='left'
)

### Question 10
**Width is quadrupled, concentration is thirded(?)** \
answer: 0.8


In [10]:
ten = pd.read_csv(annotation_root + 'q4.csv', sep='\t', index_col=0)
ten = ten.drop('index', axis=1)
ten = ten.rename(columns={'label_ten': 'q4_label', 'flag_label_q4': 'q4_flag'})

In [11]:
some_questions = some_questions.merge(
    ten, on=['consent', 'field', 'gender', 'language', 'level', 'username', 'year', 'q4'],
    how='left'
)

## Extract non labelled entries

In [12]:
some_questions = some_questions.fillna('non-labelled')

In [13]:
non_labelled = some_questions[some_questions['q1'] == 'non-labelled']

In [14]:
non_labelled.to_csv('../../data/post_test/non_labelled.csv', sep='\t')

In [15]:
with open('../../data/post_test/non_labelled.pkl', 'wb') as fp:
    pickle.dump(non_labelled, fp)

## Remerge

In [16]:
# Post test
with open('../../data/post_test/questions.pkl', 'rb') as fp:
    questions = pickle.load(fp)

with open('../../data/post_test/rankings.pkl', 'rb') as fp:
    rankings = pickle.load(fp)
    
with open('../../data/post_test/some_questions.pkl', 'rb') as fp:
    some_questions = pickle.load(fp)

In [17]:
merging_columns = [
    'consent', 'date', 'field', 'gender', 'language', 'level', 'username', 'year'
]

### question 7
**Width is halved** \
Answer: 0.37 

In [11]:
q1 = pd.read_csv(annotation_root + 'q1.csv', sep='\t', index_col=0)
q1 = q1[merging_columns + ['q1', 'label_seven', 'flag_label_seven']]

seven = seven[merging_columns + ['seven', 'label_seven']]
seven = seven.rename(columns={'seven': 'q1'})

q1 = q1.append(seven)
q1 = q1.rename(columns={'label_seven': 'label_q1', 'flag_label_seven': 'flag_q1'})

In [12]:
q1.head()

Unnamed: 0,consent,date,field,gender,language,level,username,year,q1,label_q1,flag_q1
0,1.0,non-labelled,non-labelled,2.0,Français,non-labelled,4jchqv4d,1st,0.37,correct,
1,1.0,non-labelled,non-labelled,1.0,Français,non-labelled,u6c3cp6c,1st,0.37,correct,
2,1.0,non-labelled,non-labelled,1.0,Français,non-labelled,m4zvafhs,1st,une absorbance plus grande que celle de gauche,larger,
3,1.0,non-labelled,non-labelled,1.0,Français,non-labelled,u25vrfyy,1st,1.48,inverse,
4,1.0,non-labelled,non-labelled,1.0,Français,non-labelled,c9dj36r7,1st,0.37,correct,


### question 8
**Concentration is tripled**\
answer: 1.59

In [13]:
q2 = pd.read_csv(annotation_root + 'q2.csv', sep='\t', index_col=0)
q2 = q2[merging_columns + ['q2', 'label_eight', 'flag_label_eight']]

eight = eight[merging_columns + ['eight', 'label_eight']]
eight = eight.rename(columns={'eight': 'q2'})

q2 = q2.append(eight)
q2 = q2.rename(columns={'label_eight': 'label_q2', 'flag_label_eight': 'flag_q2'})

In [14]:
q2.head()

Unnamed: 0,consent,date,field,gender,language,level,username,year,q2,label_q2,flag_q2
0,1.0,non-labelled,non-labelled,2.0,Français,non-labelled,4jchqv4d,1st,1.59,correct,
1,1.0,non-labelled,non-labelled,1.0,Français,non-labelled,u6c3cp6c,1st,1.59,correct,
2,1.0,non-labelled,non-labelled,1.0,Français,non-labelled,m4zvafhs,1st,plus grande que celle de gauche,larger,
3,1.0,non-labelled,non-labelled,1.0,Français,non-labelled,u25vrfyy,1st,0.18,smaller,
4,1.0,non-labelled,non-labelled,1.0,Français,non-labelled,c9dj36r7,1st,0.53/3,inverse,


### question 9
**Width is halved, Concentration is doubled** \
answer: 0.96

In [15]:
q3 = pd.read_csv(annotation_root + 'q3.csv', sep='\t', index_col=0)
q3 = q3[merging_columns + ['q3', 'label_nine', 'flag_label_nine']]

nine = nine[merging_columns + ['nine', 'label_nine']]
nine = nine.rename(columns={'nine': 'q3'})

q3 = q3.append(nine)
q3 = q3.rename(columns={'label_nine': 'label_q3', 'flag_label_nine': 'flag_q3'})

### Question 10
**Width is quadrupled, concentration is thirded(?)** \
answer: 0.8

In [15]:
q4 = pd.read_csv(annotation_root + 'q4.csv', sep='\t', index_col=0)
q4 = q4[merging_columns + ['q4', 'label_ten', 'flag_label_ten']]

ten = ten[merging_columns + ['ten', 'label_ten']]
ten = ten.rename(columns={'ten': 'q4'})

q4 = q4.append(nine)
q4 = q4.rename(columns={'label_ten': 'label_q4', 'flag_label_ten': 'flag_q4'})

KeyError: "['label_ten', 'flag_label_ten'] not in index"

### Merge with some questions

In [16]:
merging_columns = [
    'consent', 'field', 'gender', 'language', 'level', 'username', 'year'
]

In [31]:
some = some.merge(q1, on=merging_columns + ['q1'], how='left')
some = some.merge(q2, on=merging_columns + ['q2'], how='left')
some = some.merge(q3, on=merging_columns + ['q3'], how='left')
some = some.merge(q4, on=merging_columns + ['q4'], how='left')

# Second iteration of labelling 

## Merge *some* with annotations

In [4]:
prior = pd.read_csv(annotation_root + 'prior.tsv', sep='\t', index_col=0)

In [5]:
merging_columns = [
    'field', 'gender', 'language', 'level', 'username', 'year', 'consent'
]

# Read data
q1 = pd.read_csv(annotation_root + 'q1.csv', sep='\t', index_col=0)
q2 = pd.read_csv(annotation_root + 'q2.csv', sep='\t', index_col=0)
q3 = pd.read_csv(annotation_root + 'q3.csv', sep='\t', index_col=0)
q4 = pd.read_csv(annotation_root + 'q4.csv', sep='\t', index_col=0)
prior = pd.read_csv(annotation_root + 'prior.tsv', sep='\t', index_col=0)

# Process each of the annotation files
q1 = q1.set_index('index')
q2 = q2.set_index('index')
q3 = q3.set_index('index')
q4 = q4.set_index('index')

def process_prior(row):
    if row['decisions'] == 'v':
        prior = row['Jade granular']
    else:
        prior = row['decisions']
    prior = prior.replace('’', "'").replace(' ,', ',').replace(', ', ',')
    return prior
prior = prior.set_index('index')
prior['prior_knowledge'] = prior.apply(process_prior, axis=1)
prior = prior[['username', 'gender', 'field', 'prior_knowledge']]

q1 = q1.rename(columns={'label_seven': 'q1_label', 'flag_label_seven': 'q1_flag'})
q2 = q2.rename(columns={'label_eight': 'q2_label', 'flag_q2_label': 'q2_flag'})
q3 = q3.rename(columns={'label_q3': 'q3_label', 'flag_label_q3': 'q3_flag'})
q4 = q4.rename(columns={'label_q4': 'q4_label', 'flag_label_q4': 'q4_flag'})

In [6]:
some = some.merge(q1, on=merging_columns + ['q1'], how='left')
some = some.merge(q2, on=merging_columns + ['q2'], how='left')
some = some.merge(q3, on=merging_columns + ['q3'], how='left')
some = some.merge(q4, on=merging_columns + ['q4'], how='left')
some = some.merge(prior, on=['username', 'gender', 'field'], how='left')

## Merge *some* with the other dataframes

In [7]:
some_columns = [
    'q1', 'q1_label', 'q1_flag', 
    'q2', 'q2_label', 'q2_flag',
    'q3', 'q3_label', 'q3_flag',
    'q4', 'q4_label', 'q4_flag',
    'prior_knowledge'
]

dataframe_merging_columns = merging_columns + ['q1', 'q2', 'q3', 'q4']
some_reduced = some[merging_columns + some_columns]

In [8]:
some_questions = some_questions.merge(some_reduced, on=dataframe_merging_columns, how='left')
questions = questions.merge(some_reduced, on=dataframe_merging_columns, how='left')
rankings = rankings.merge(some_reduced, on=dataframe_merging_columns, how='left')

In [9]:
with open('../../data/post_test/some_annotated.pkl', 'wb') as fp:
    pickle.dump(some, fp)

with open('../../data/post_test/some_questions_annotated.pkl', 'wb') as fp:
    pickle.dump(some_questions, fp)
    
with open('../../data/post_test/questions_annotated.pkl', 'wb') as fp:
    pickle.dump(questions, fp)
    
with open('../../data/post_test/rankings_annotated.pkl', 'wb') as fp:
    pickle.dump(rankings, fp)