In this notebook, we take the extracted post-test information and clean it so that:

- DVET entries are removed
- entries with missing rankings are removed

In [1]:
import os 
import re
import pickle

import numpy as np
import pandas as pd
# Display every column of a DataFrame instead of truncating wide frames.
pd.set_option('display.max_columns', None)

# Data

In [2]:
# Load the extracted post-test frame.
# NOTE(review): pickle.load can execute arbitrary code; only load files produced by this project.
with open('../../data/post_test/extracted.pkl', 'rb') as fp:
    post = pickle.load(fp)

# Rows without a ranking get the explicit label 'missing' instead of NaN.
post['ranking'] = post['ranking'].fillna('missing')

# Strip KeyError residue (e.g. "KeyError('5')" or "'5'") from the status strings.
# Series.replace only swaps values that match in full, so removing substrings requires
# the .str accessor; regex=False treats the parentheses and quotes literally.
post['status'] = (
    post['status']
    .astype(str)
    .str.replace("KeyError('", '', regex=False)
    .str.replace("')", '', regex=False)
    .str.replace("'", '', regex=False)
)

# Show every distinct ranking value so that invalid ones can be spotted by eye.
unique_rankings = post['ranking'].unique()
unique_rankings

array(['2130', '3120', '3201', '3012', '2103', '0123', 'missing', '0231',
       '2031', '3021', '2013', '3210', '1320', '2301', '0321',
       'wrong field', '0312', '0213', '1032', '1023', '2310', '3102',
       '1230', '1203', '1302', '0132', ''], dtype=object)

In [3]:
# The ranking strings accepted as valid: the values displayed above minus
# 'missing', 'wrong field' and the empty string. Order matches the original list.
unique_rankings = (
    '2130 3120 3201 3012 2103 0123 0231 '
    '2031 3021 2013 3210 1320 2301 0321 '
    '0312 0213 1032 1023 2310 3102 '
    '1230 1203 1302 0132'
).split()

# Clean

## Status
Filter the entries based solely on how much of the post-test was completed.

In [4]:
data_len = len(post)
print('length of the dataframe: {}, unique usernames: {}'.format(data_len, len(post['username'].unique())))

# Take out useless entries: statuses '0', '2', '3' and '4' ('4' = got ranking time but nothing
# else), plus status '5' rows whose ranking is the empty string.
no_information = post['status'].isin(['0', '2', '3', '4']) | ((post['status'] == '5') & (post['ranking'] == ''))
# .copy() makes the filtered frame independent, so adding a column below does not assign
# into a slice of the loaded frame (which raises SettingWithCopyWarning).
post = post[~no_information].copy()
print('There were {} entries without any information'.format(data_len - len(post)))

# Map the two textual statuses onto numbers so every status can be compared numerically.
post['numerical_status'] = post['status'].replace('complete', 18).replace('feedback', 16)

# Compute the integer status and the ranking validity once and reuse them for every bucket.
status_number = post['numerical_status'].astype(int)
has_ranking = post['ranking'].isin(unique_rankings)
is_partial = (status_number >= 6) & (status_number < 18)
is_complete = status_number == 18
# NOTE(review): rows with a status below 6 that survive the filter above (e.g. '5' with a
# non-empty ranking) fall into none of the four groups below - confirm this is intended.

# Partial data
partial_rankings = post[is_partial & has_ranking]
partial_questions = post[is_partial & ~has_ranking]
print('There are {} entries who did not finish the test but did fill in the ranking questions'.format(len(partial_rankings)))
print('There are {} entries who did not finish the test and did not fill in the ranking questions'.format(len(partial_questions)))

# Complete data
complete_rankings = post[is_complete & has_ranking]
complete_questions = post[is_complete & ~has_ranking]
print('There are {} entries finished the test but did not fill in the ranking questions'.format(len(complete_questions)))
print('There are {} entries finished the test and did fill in the ranking questions'.format(len(complete_rankings)))

length of the dataframe: 518, unique usernames: 518
There were 23 entries without any information
There are 51 entries who did not finish the test but did fill in the ranking questions
There are 7 entries who did not finish the test and did not fill in the ranking questions
There are 56 entries finished the test but did not fill in the ranking questions
There are 371 entries finished the test and did fill in the ranking questions


In [5]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat stacks the
# same frames in the same order and yields the same rows.
questions = pd.concat([complete_rankings, complete_questions])   # test finished
rankings = pd.concat([partial_rankings, complete_rankings])      # valid ranking present
some_questions = pd.concat([partial_questions, questions])
# rankings and some_questions both contain the complete_rankings rows, so keep a single
# row per username after stacking them.
some = pd.concat([some_questions, rankings]).drop_duplicates('username')
print('There are {} entries for which the post test is complete.'.format(len(questions)))
print('There are {} entries for which the rankings has been completed'.format(len(rankings)))
print('There are {} entries for which at least one question has been answered'.format(len(some_questions)))
print('There are {} entries for which at least one question or the ranking has been answered'.format(len(some)))

There are 427 entries for which the post test is complete.
There are 422 entries for which the rankings has been completed
There are 434 entries for which at least one question has been answered
There are 485 entries for which at least one question or the ranking has been answered


In [6]:
# Write each cleaned subset to its own pickle file, in the same order as before.
outputs = [
    ('../../data/post_test/questions.pkl', questions),            # full questions
    ('../../data/post_test/rankings.pkl', rankings),              # rankings completed
    ('../../data/post_test/some_questions.pkl', some_questions),  # partial questions
    ('../../data/post_test/some.pkl', some),                      # partial questions or ranking
]
for path, frame in outputs:
    with open(path, 'wb') as fp:
        pickle.dump(frame, fp)