# Setup

In [None]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
from Levenshtein import ratio as stringSimilarity
import string
import os

# Let pandas print large frames and long text cells without truncation.
# (These three options were previously set twice; once is enough.)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 1000)

# Default matplotlib figure size as [width, height].
# The name `fig_size` is kept at module level in case later cells read it.
fig_size = [12, 5]
plt.rcParams["figure.figsize"] = fig_size


In [None]:
# Load the cleaned master data from its pickle file.
# Later cells read its 'CONS_id' and 'CONS_question' columns.

t_data = pd.read_pickle('../workproduct-files/cleaned-dataframes/t_dataMaster.pkl')

# Overlap analysis

## String similarity

### Functions and function calls that run the string-similarity calculations (Levenshtein `ratio`, imported as `stringSimilarity`)

In [None]:
# calculates similarity between single string y and list of strings in x
def similFunc(x, y, cutoff=0.8):
    """Score string ``y`` against every string in ``x``.

    The comparison is case-insensitive and uses ``stringSimilarity``
    (Levenshtein ``ratio``).

    Args:
        x: pandas Series of strings to compare against.
        y: the single string that is compared with each element of ``x``.
        cutoff: scores strictly below this value are replaced by NaN so that
            only sufficiently similar pairs survive. Defaults to 0.8, the
            threshold this function has always applied.

    Returns:
        A list with one entry per element of ``x`` (same order): the
        similarity score, or NaN where the score is below ``cutoff``.
    """
    # Lower-case the reference string once instead of once per element.
    target = y.lower()
    scores = x.apply(lambda s: stringSimilarity(s.lower(), target))

    # `scores` is a fresh Series produced by apply, so masking it in place
    # does not touch the caller's data.
    scores[scores < cutoff] = np.nan

    return scores.tolist()

In [None]:
def compareStrings(d, parts):
    """Compare every string in ``d`` with every other one, writing results in parts.

    ``d`` is cut into ``parts`` consecutive slices of ``len(d) // parts`` rows
    (the last slice also takes the remainder). Each string of a slice is
    scored against all of ``d`` with ``similFunc``; the scores that
    ``similFunc`` did not blank out are stored in long form, one row per pair,
    with columns ``i``, ``j`` and ``similarityRatio``. Only pairs with
    ``i > j`` are kept, which removes self-comparisons and mirrored
    duplicates. Each slice's result is pickled to
    ``../workproduct-files/similarityMatrix/partialDataFrames/`` as
    ``similarityM-<k>of<parts>.pkl`` (``k`` zero-padded), and progress is
    printed.

    Args:
        d: pandas Series of strings. Its index labels are what ends up in the
            ``i`` and ``j`` columns.
        parts: number of slices / output files; must be at least 1.

    Raises:
        ValueError: if ``parts`` is smaller than 1.
    """
    if parts < 1:
        raise ValueError('parts must be a positive integer, got ' + repr(parts))

    chunkSize = len(d) // parts
    padWidth = len(str(parts))
    outDir = '../workproduct-files/similarityMatrix/partialDataFrames/'

    for i in range(parts):
        startInd = chunkSize * i
        # The final slice is open-ended so it absorbs the division remainder.
        endInd = None if i == parts - 1 else chunkSize * (i + 1)
        chunk = d.iloc[startInd:endInd]

        # One list of len(d) scores per string in the chunk.
        rows = chunk.apply(lambda q: similFunc(d, q))

        # Label columns with d's own index so that `i` and `j` are both index
        # labels. (Positional column numbers only agree with the labels when
        # d has a default 0..n-1 index.)
        matrix = pd.DataFrame(rows.tolist(), index=chunk.index, columns=d.index)

        # Wide -> long. Naming the id/variable/value columns explicitly avoids
        # depending on whether d's index happens to carry a name.
        melt = (matrix.rename_axis('i')
                      .reset_index()
                      .melt(id_vars=['i'], var_name='j', value_name='similarityRatio')
                      .dropna())

        # Keep each unordered pair once and drop the diagonal.
        melt = melt.loc[melt['i'] > melt['j']].reset_index(drop=True)

        # Zero-pad the part number so the files sort in order.
        fileI = str(i + 1).zfill(padWidth)
        melt.to_pickle(outDir + 'similarityM-' + fileI + 'of' + str(parts) + '.pkl')
        print(fileI + ' of ' + str(parts))
    

In [None]:
# Strips ASCII punctuation and space characters from a string and lower-cases it
def removeStringFormatting(x):
    """Return ``x`` lower-cased, without ASCII punctuation and without spaces.

    Only the plain space character is removed; tabs, newlines and other
    whitespace are left in place.
    """
    unwanted = str.maketrans('', '', string.punctuation + ' ')
    stripped = x.translate(unwanted)
    return stripped.lower()

In [None]:
# Take the question texts from the master data, and build a lookup table that
# pairs each row's index label (column 'index' after reset_index) with its 'CONS_id'

allQuestions = t_data['CONS_question']
IDtoIndex = t_data['CONS_id'].reset_index()

In [None]:
%time similData = allQuestions.apply(lambda x: removeStringFormatting(x))

In [None]:
# Number of slices the comparison is split into; compareStrings writes one pickle per slice
partsToSplit = 1000

# Runs the pairwise comparison; results are written to disk, nothing is returned
%time compareStrings(similData, partsToSplit)

# Save the index -> CONS_id table so results can be mapped back to questions later
IDtoIndex.to_pickle('../workproduct-files/similarityMatrix/IDtoIndex.pkl')

## Creating consolidated similarity matrix and adding IDs and questions

In [None]:
%%time
# Read every partial result (in file-name order) and stack them into one table.
# A single pd.concat replaces the repeated DataFrame.append, which copied the
# growing frame on every iteration and no longer exists in pandas 2.x.
location = '../workproduct-files/similarityMatrix/partialDataFrames/'
partialFrames = [pd.read_pickle(location + file) for file in sorted(os.listdir(location))]

# pd.concat raises on an empty list, so fall back to an empty frame as before.
if partialFrames:
    similToPlot = pd.concat(partialFrames, ignore_index=True)
else:
    similToPlot = pd.DataFrame()

# Reload the lookup tables so this cell also works in a fresh session.
IDtoIndex = pd.read_pickle('../workproduct-files/similarityMatrix/IDtoIndex.pkl')
t_data = pd.read_pickle('../workproduct-files/cleaned-dataframes/t_dataMaster.pkl')
print(len(similToPlot))

In [None]:
%%time
# Build each lookup once and map through it, instead of scanning the whole
# lookup table with a boolean filter for every single row.
# drop_duplicates keeps the first occurrence of a key, matching the previous
# `.iloc[0]` choice. A key with no match now yields NaN rather than aborting
# the cell with an IndexError.
indexToId = IDtoIndex.drop_duplicates('index').set_index('index')['CONS_id']
idToQuestion = t_data.drop_duplicates('CONS_id').set_index('CONS_id')['CONS_question']

similToPlot['i-id'] = similToPlot['i'].map(indexToId)
print('first row done')
similToPlot['j-id'] = similToPlot['j'].map(indexToId)
print('second row done')
similToPlot['i-question'] = similToPlot['i-id'].map(idToQuestion)
print('third row done')
similToPlot['j-question'] = similToPlot['j-id'].map(idToQuestion)
print('All done')

In [None]:
# Order the pairs from most to least similar, renumber the rows from 0, and save the final table
similToPlotSorted = similToPlot.sort_values('similarityRatio', ascending = False).reset_index(drop = True)
similToPlotSorted.to_pickle('../workproduct-files/similarityMatrix/FullSimilarityComparison.pkl')