# Setup

In [1]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
from Levenshtein import ratio as stringSimilarity

# Let pandas render up to 500 rows / 500 columns and up to 1000 characters per
# cell, so long text values are not truncated when frames are displayed.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 1000)

# Default figure size (width, height) for every plot; matplotlib measures it in inches.
# The name fig_size is kept as a global in case later cells refer to it.
fig_size = [12, 5]
plt.rcParams["figure.figsize"] = fig_size


In [2]:
# Load the master data object that the rest of this notebook works from.
# NOTE(review): unpickling can execute arbitrary code embedded in the file, so
# only load pickles that were produced by this project's own pipeline.

t_data = pd.read_pickle('../workproduct-files/cleaned-dataframes/t_dataMaster.pkl')

# Overlap analysis

## String similarity

### Functions and function calls for computing Levenshtein similarity ratios (`Levenshtein.ratio`, imported as `stringSimilarity`)

In [3]:
def similFunc(x, y, cutoff=0.8, scorer=None):
    """Score every string in ``x`` against the single string ``y``.

    Parameters
    ----------
    x : pandas.Series
        The strings to compare against ``y``.
    y : str
        The reference string.
    cutoff : float, default 0.8
        Minimum score to keep. Scores strictly below it are replaced by NaN;
        a score exactly equal to the cutoff is kept.
    scorer : callable, optional
        Two-argument function ``scorer(candidate, y)`` returning a numeric
        similarity. When omitted, the notebook-level ``stringSimilarity``
        (``Levenshtein.ratio``) is used.

    Returns
    -------
    list of float
        One entry per element of ``x``, in the same order, holding the score
        or NaN where the score fell below ``cutoff``.
    """
    if scorer is None:
        # Looked up at call time so the default follows the notebook's import.
        scorer = stringSimilarity

    scores = x.apply(lambda candidate: scorer(candidate, y))

    # Series.where keeps entries that satisfy the condition and writes NaN
    # everywhere else, which is exactly the "blank out weak matches" step.
    return scores.where(scores >= cutoff).tolist()

In [4]:
def compareStrings(d, parts, outDir='../workproduct-files/similarityMatrix/partialDataFrames/'):
    """Compare every string in ``d`` with every other one, chunk by chunk.

    ``d`` is cut into ``parts`` consecutive chunks of ``len(d) // parts`` rows
    (the last chunk also takes the remainder). For each chunk, every string in
    it is scored against all of ``d`` with ``similFunc``; the surviving
    (non-NaN) scores are stored in long form and pickled to
    ``<outDir>/similarityM-<k>of<parts>.pkl``, where ``k`` is the 1-based chunk
    number zero-padded to the width of ``parts``. A progress line is printed
    after each chunk.

    Each saved frame has the columns ``i``, ``j`` and ``similarityRatio``.
    Both ``i`` and ``j`` are index labels of ``d``. Only rows with ``i > j``
    are kept, which removes self-comparisons and the mirrored duplicate of
    every pair. For a default 0..n-1 index the labels equal the positions.

    Parameters
    ----------
    d : pandas.Series
        The strings to compare.
    parts : int
        Number of chunks / output files. Must be at least 1.
    outDir : str, optional
        Directory that receives the partial pickles; created if missing.

    Raises
    ------
    ValueError
        If ``parts`` is smaller than 1.
    """
    import os  # function-scope import: the notebook's import cell does not load os

    if parts < 1:
        raise ValueError('parts must be a positive integer, got ' + repr(parts))

    if outDir:
        os.makedirs(outDir, exist_ok=True)

    total = len(d)
    chunkSize = total // parts
    padWidth = len(str(parts))
    colLabels = d.index.to_numpy()

    for part in range(parts):
        startInd = chunkSize * part
        # The final chunk runs to the end so the division remainder is not lost.
        endInd = None if part == parts - 1 else chunkSize * (part + 1)
        chunk = d.iloc[startInd:endInd]

        # One row of scores per string in the chunk, one column per string in d.
        rows = [similFunc(d, question) for question in chunk]
        ratios = np.asarray(rows, dtype=float).reshape(len(chunk), total)

        # Long form in column-major order: for each column label j, all row labels i.
        rowLabels = chunk.index.to_numpy()
        pairs = pd.DataFrame({
            'i': np.tile(rowLabels, total),
            'j': np.repeat(colLabels, len(chunk)),
            'similarityRatio': ratios.ravel(order='F'),
        })

        # Drop the scores similFunc blanked out, then the redundant i, j combinations.
        pairs = pairs.dropna()
        pairs = pairs.loc[pairs['i'] > pairs['j']].reset_index(drop=True)

        # Save part to pickle
        fileI = str(part + 1).zfill(padWidth)
        pairs.to_pickle(os.path.join(outDir, 'similarityM-' + fileI + 'of' + str(parts) + '.pkl'))
        print(fileI + ' of ' + str(parts))
    

In [5]:
# Pull the 'CONS_question' column out of the master data; these are the strings
# that get compared. Also build the lookup table that is later pickled as
# IDtoIndex.pkl: reset_index() turns the row index of t_data into a regular
# column sitting next to 'CONS_id'.

allQuestions = t_data['CONS_question']
IDtoIndex = t_data['CONS_id'].reset_index()

In [6]:
similData = allQuestions
partsToSplit = 1000

%time compareStrings(similData, partsToSplit)
IDtoIndex.to_pickle('../workproduct-files/similarityMatrix/IDtoIndex.pkl')

0001 of 1000
0002 of 1000
0003 of 1000
0004 of 1000
0005 of 1000
0006 of 1000
0007 of 1000
0008 of 1000
0009 of 1000
0010 of 1000
0011 of 1000
0012 of 1000
0013 of 1000
0014 of 1000
0015 of 1000
0016 of 1000
0017 of 1000
0018 of 1000
0019 of 1000
0020 of 1000
0021 of 1000
0022 of 1000
0023 of 1000
0024 of 1000
0025 of 1000
0026 of 1000
0027 of 1000
0028 of 1000
0029 of 1000
0030 of 1000
0031 of 1000
0032 of 1000
0033 of 1000
0034 of 1000
0035 of 1000
0036 of 1000
0037 of 1000
0038 of 1000
0039 of 1000
0040 of 1000
0041 of 1000
0042 of 1000
0043 of 1000
0044 of 1000
0045 of 1000
0046 of 1000
0047 of 1000
0048 of 1000
0049 of 1000
0050 of 1000
0051 of 1000
0052 of 1000
0053 of 1000
0054 of 1000
0055 of 1000
0056 of 1000
0057 of 1000
0058 of 1000
0059 of 1000
0060 of 1000
0061 of 1000
0062 of 1000
0063 of 1000
0064 of 1000
0065 of 1000
0066 of 1000
0067 of 1000
0068 of 1000
0069 of 1000
0070 of 1000
0071 of 1000
0072 of 1000
0073 of 1000
0074 of 1000
0075 of 1000
0076 of 1000
0077 of 1000