In [1]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import os

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 1000)

fig_size = plt.rcParams["figure.figsize"]
fig_size[0] = 12
fig_size[1] = 5
plt.rcParams["figure.figsize"] = fig_size

In [2]:
t_data = pd.read_pickle('../workproduct-files/cleaned-dataframes/t_dataMaster.pkl')
similarity = pd.read_pickle('../workproduct-files/similarityMatrix/FullSimilarityComparison.pkl')

## Set cutoff - row-pairs with similarity ratio > cutoff are processed

In [3]:
# Cutoff determined by estimating at what point similar rows are likely no to be true duplicates
cutoff = 0.98

In [4]:
similarity = similarity.loc[similarity['similarityRatio'] > cutoff]

## Select rows to drop from data master

In [5]:
t_data.set_index('CONS_id', inplace = True)

In [6]:
cols_comparison_i = ['i-category', 'i - Has category', 'i - Is all title case', 'i - Ends in punctuation', 'i - nr of cap words', 'i - source']
cols_comparison_j = ['j-category', 'j - Has category', 'j - Is all title case', 'j - Ends in punctuation', 'j - nr of cap words', 'j - source']
cols_comparison_i.extend(cols_comparison_j)
cols_comparison = cols_comparison_i
dfColsComp = pd.DataFrame(columns = cols_comparison)

In [7]:
similarity = similarity.append(dfColsComp)

In [8]:
def countCaps(x):
    i = 0
    for c in x:      
        i = i + c.isupper()
    return i

In [9]:
similarity['i - source'] = similarity['i-id'].apply(lambda x: t_data.loc[x, 'Source'])
similarity['j - source'] = similarity['j-id'].apply(lambda x: t_data.loc[x, 'Source'])

In [10]:
similarity['i-category'] = similarity['i-id'].apply(lambda x: t_data.loc[x, 'CONS_category'])
similarity['j-category'] = similarity['j-id'].apply(lambda x: t_data.loc[x, 'CONS_category'])

In [11]:
similarity['i - Has category'] = similarity['i-id'].apply(lambda x: not t_data.loc[x, 'CONS_category'] == 'Uncategorized')
similarity['j - Has category'] = similarity['j-id'].apply(lambda x: not t_data.loc[x, 'CONS_category'] == 'Uncategorized')

In [12]:
similarity['i - Is all title case'] = similarity['i-id'].apply(lambda x: t_data.loc[x, 'CONS_question'].istitle())
similarity['j - Is all title case'] = similarity['j-id'].apply(lambda x: t_data.loc[x, 'CONS_question'].istitle())

In [13]:
similarity['i - Ends in punctuation'] = similarity['i-id'].apply(lambda x: t_data.loc[x, 'CONS_question'][-1] in ['?', '.', '"', "'"])
similarity['j - Ends in punctuation'] = similarity['j-id'].apply(lambda x: t_data.loc[x, 'CONS_question'][-1] in ['?', '.', '"', "'"])

In [14]:
similarity['i - nr of cap words'] = similarity['i-id'].apply(lambda x: countCaps(t_data.loc[x, 'CONS_question']) )
similarity['j - nr of cap words'] = similarity['j-id'].apply(lambda x: countCaps(t_data.loc[x, 'CONS_question']) )

### Requirements on which to choose duplicate to drop
- 1) If one from rtg, other from tdb -> change q and category from rtg to tdb, keep tdb
- 2) Has category -> keep
- 3) Is all title case -> drop
- 4) Ends in punctuation ? . ?" -> keep
- 5) Not title case but has more capitalized words than other -> keep

In [15]:
def idToKeep(x): # take row from similarity, return [id to keep, question if changed, category if changed]
   
    ret = {'idToKeep': np.nan, 'idToDrop': np.nan, 'changedQ': np.nan, 'changedCat': np.nan}

       
    # Keep not rtg and substitute with rtg data
    if x['i - source'] == 'rtg' and x['j - source'] != 'rtg':
        ret['idToKeep'] = x['j-id']
        ret['changedQ'] = x['i-question']
        ret['changedCat'] = x['i-category']
        #print('3')
    elif x['j - source'] == 'rtg' and x['i - source'] != 'rtg':
        ret['idToKeep'] = x['i-id']
        ret['changedQ'] = x['j-question']
        ret['changedCat'] = x['j-category']
        #print('4')
    
    # Keep the one that has a category
    elif int(x['i - Has category']) + int(x['j - Has category']) == 1:
        if x['i - Has category'] == True:
            ret['idToKeep'] = x['i-id']
            #print('1')
        if x['j - Has category'] == True:
            ret['idToKeep'] = x['j-id']
            #print('2')
        
    # Don't keep if is all title case
    elif int(x['i - Is all title case']) + int(x['j - Is all title case']) == 1:
        if x['i - Is all title case'] == True:
            ret['idToKeep'] = x['j-id']
            #print('5')
        if x['j - Is all title case'] == True:
            ret['idToKeep'] = x['i-id']
            #print('6')
    # Keep the one that ends in punctuation
    elif int(x['i - Ends in punctuation']) + int(x['j - Ends in punctuation']) == 1:
        if x['i - Ends in punctuation'] == True:
            ret['idToKeep'] = x['j-id']
            #print('7')
        if x['j - Ends in punctuation'] == True:
            ret['idToKeep'] = x['i-id']
            #print('8')
    # Keep the one that has more capital case words
    elif x['i - nr of cap words'] > x['j - nr of cap words']:
        ret['idToKeep'] = x['i-id']
        #print('9')
    elif x['j - nr of cap words'] > x['i - nr of cap words']:
        ret['idToKeep'] = x['j-id']
        #print('10')
    # Keep i as default, change to capitalized if necessary
    else:
        if ['i - Is all title case']:
            ret['changedQ'] = x['i-question'].capitalize()
            
        ret['idToKeep'] = x['i-id']
        #print('11')
    
    # Document id to drop
    if ret['idToKeep'] == x['i-id']:
        ret['idToDrop'] = x['j-id']
    else:
        ret['idToDrop'] = x['i-id']
    
    return ret
    

In [16]:
%%time
masterEdits = pd.DataFrame(columns = ['idToKeep', 'idToDrop', 'changedQ', 'changedCat'])

for row in similarity.index:
    masterEdits = masterEdits.append(idToKeep(similarity.loc[row]), ignore_index = True)

Wall time: 22.3 s


## Map info to t_data and update. Save t_data as new master file

In [17]:
t_data = t_data.append(pd.DataFrame(columns = ['Drop row', 'qUpdate_duplRemoved', 'catUpdate_duplRemoved', 'Duplicates_dropped']))

In [18]:
t_data.loc[masterEdits['idToDrop'], 'Drop row'] = True

In [19]:
# List dropped duplicate id's in t_data['Duplicates_dropped']
t_data.loc[masterEdits['idToKeep'], 'Duplicate_removed'] = t_data.loc[masterEdits['idToKeep']].index.map(lambda x: masterEdits.loc[masterEdits['idToKeep'] == x, 'idToDrop'].tolist())


In [20]:
# Save updated questions and categories
t_data.loc[masterEdits['idToKeep'], 'qUpdate'] = t_data.loc[masterEdits['idToKeep']].index.map(lambda x: masterEdits.loc[masterEdits['idToKeep'] == x, 'changedQ'].tolist())
t_data.loc[masterEdits['idToKeep'], 'catUpdate'] = t_data.loc[masterEdits['idToKeep']].index.map(lambda x: masterEdits.loc[masterEdits['idToKeep'] == x, 'changedCat'].tolist())

In [21]:
# Clean qUpdate and catUpdate from nans, and empty lists, remove Uncategorized from catUpdate
t_data.loc[t_data['qUpdate'].notnull(), 'qUpdate'] = t_data.loc[t_data['qUpdate'].notnull(), 'qUpdate'].apply(lambda x: [y for y in x if pd.notnull(y)])
t_data.loc[t_data['catUpdate'].notnull(), 'catUpdate'] = t_data.loc[t_data['catUpdate'].notnull(), 'catUpdate'].apply(lambda x: [y for y in x if pd.notnull(y)])
t_data.loc[t_data['catUpdate'].notnull(), 'catUpdate'] = t_data.loc[t_data['catUpdate'].notnull(), 'catUpdate'].apply(lambda x: [y for y in x if y != 'Uncategorized'])

t_data.loc[t_data['qUpdate'].str.len() == 0, 'qUpdate'] = np.nan
t_data.loc[t_data['catUpdate'].str.len() == 0, 'catUpdate'] = np.nan

In [22]:
# Save data master with duplicate info columns
t_data.to_pickle('../workproduct-files/similarityMatrix/t_dataMaster-withDuplicateInfo.pkl')

In [23]:
# Update master data question and category with new data
t_data.loc[t_data['qUpdate'].notnull(), 'CONS_question'] = t_data.loc[t_data['qUpdate'].notnull(), 'qUpdate'].apply(lambda x: x[0])
t_data.loc[t_data['catUpdate'].notnull(), 'CONS_category'] = t_data.loc[t_data['catUpdate'].notnull(), 'catUpdate'].apply(lambda x: x[0])

In [24]:
# Reset index and rename 'CONS_id' column
t_data.reset_index(inplace = True)
t_data.rename(columns = {'index': 'CONS_id'}, inplace = True)

In [25]:
t_data.drop(t_data.loc[t_data['Drop row'] == True].index, inplace = True)
t_data.drop(['Drop row', 'qUpdate_duplRemoved', 'catUpdate_duplRemoved', 'Duplicates_dropped', 'qUpdate', 'catUpdate'], axis = 1, inplace = True)
t_data.reset_index(drop = True, inplace = True)

In [26]:
# Save new data master
t_data.to_pickle('../workproduct-files/cleaned-dataframes/t_dataMaster-duplicatesRemoved.pkl')

In [27]:
t_data

Unnamed: 0,CONS_id,CONS_question,CONS_answer,CONS_alt answers,CONS_category,CONS_alt categories - NOT USED,CONS_type-formulation,CONS_type-multipleChoice,ORIG_id,ORIG_question,ORIG_answer,ORIG_alt answers,ORIG_category,ORIG_alt categories,ORIG_difficulty,ORIG_type,Source,Duplicate_removed
0,tdb_0x000000,"""Now is the winter of our discontent"" is a line from which Shakespearian play?",Richard III,"[Romeo and Juliet, Macbeth]",Art and literature,,Question,True,tdb_0x000000,"""Now is the winter of our discontent"" is a line from which Shakespearian play?",0,"[Richard III, Romeo and Juliet, Macbeth]",ART_AND_LITERATURE,,,,tdb,
1,tdb_0x000001,"""Our Town"" is a play by whom?",Thornton Wilder,,Art and literature,,Question,False,tdb_0x000001,"""Our Town"" is a play by whom?",0,[Thornton Wilder],ART_AND_LITERATURE,,,,tdb,
2,tdb_0x000002,"""The Diary of Anne Frank"" was first published in English under what title?",The diary of a young girl,,Art and literature,,Question,False,tdb_0x000002,"""The Diary of Anne Frank"" was first published in English under what title?",0,[The diary of a young girl],ART_AND_LITERATURE,,,,tdb,[tdb_0x006650]
3,tdb_0x000003,"A band of painted or sculpted decoration, often at the top of a wall.",A frieze,,Art and literature,,Statement - open,False,tdb_0x000003,"A band of painted or sculpted decoration, often at the top of a wall.",0,[A frieze],ART_AND_LITERATURE,,,,tdb,
4,tdb_0x000004,"A composition made of cut and pasted pieces of materials, sometimes with images added by the artist.",Collage,,Art and literature,,Statement - open,False,tdb_0x000004,"A composition made of cut and pasted pieces of materials, sometimes with images added by the artist.",0,[Collage],ART_AND_LITERATURE,,,,tdb,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64534,rtg_0x005023,What was the original intended use for the RC car prior to being a toy?,War,,Uncategorized,,Question,False,rtg_0x005023,What was the original intended use for the RC car prior to being a toy?,War,,general,,,,rtg,
64535,rtg_0x005025,What was the name of the sacred river in Samuel Taylor Coleridge's Xanadu?,Alph,,Art and literature,,Question,False,rtg_0x005025,What was the name of the sacred river in Samuel Taylor Coleridge's Xanadu?,Alph,,arts,,,,rtg,
64536,rtg_0x005026,Who played James Bond in Diamonds Are Forever?,Sean Connery,,Entertainment,,Question,False,rtg_0x005026,Who played James Bond in Diamonds Are Forever?,Sean Connery,,entertainment,,,,rtg,
64537,rtg_0x005027,Who is known as Slim Shady?,Marshall Mathers,,Entertainment,,Question,False,rtg_0x005027,Who is known as Slim Shady?,Marshall Mathers,,entertainment,,,,rtg,
