SEO text similarity metrics,
as discussed in:
https://towardsdatascience.com/overview-of-text-similarity-metrics-3397c4601f50

In [1]:
import glob
import os
import re
from collections import OrderedDict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Read the files and store the text in a dict.
The script assumes that an autotext CSV file and a human-text CSV file are located in the ./data folder.

In [2]:
def load_text_columns(paths):
    '''Read the last column of every CSV in `paths` into an OrderedDict.

    Parameters
    ----------
    paths : iterable of str
        CSV file paths. Each path is printed as it is read.

    Returns
    -------
    OrderedDict
        Maps a normalised file name to a pandas Series holding the last
        column of that file. The key is the base name with blanks replaced
        by underscores, lower-cased and cut at the first dot. Entries are
        sorted by key.
    '''
    texts = {}
    for path in paths:
        print(path)
        # basename instead of split('/') so Windows separators work as well
        dict_key = os.path.basename(path).replace(' ', '_').lower().split('.')[0]
        # header=None treats every row as data; the text is taken from the last column
        texts[dict_key] = pd.read_csv(path, header=None).iloc[:, -1]
    return OrderedDict(sorted(texts.items()))


# glob instead of the `! ls` shell escape: runs outside IPython and on any OS,
# and yields an empty list (not an error message) when nothing matches.
files = sorted(glob.glob('./data/*.csv'))
text_dict = load_text_columns(files)

./data/backpacks_uk_autotext.csv
./data/backpacks_uk_human_written_texts.csv


The importer tolerates *messy* file names and inconsistent column names; however, the text must be in the last column.

https://docs.python.org/2/library/collections.html#ordereddict-examples-and-recipes

jaccard

In [3]:
# Characters removed before tokenising: acute accent, typographic and straight
# apostrophes, and . , ! ? : ;
_JACCARD_STRIP = re.compile("[´’'.,!?:;]")


def get_jaccard_sim(str1, str2):
    '''Return the Jaccard similarity of the word sets of two strings.

    Punctuation (´ ’ ' . , ! ? : ;) is stripped first, then each string is
    split on runs of whitespace. The comparison is case-sensitive.

    Parameters
    ----------
    str1, str2 : str
        Texts to compare.

    Returns
    -------
    float
        |A & B| / |A | B| for the two word sets; 1.0 when neither string
        contains a word.
    '''
    # split() without an argument: repeated, leading or trailing blanks (and a
    # stripped stand-alone punctuation mark) must not produce an empty-string
    # "word" that both texts would then share.
    a = set(_JACCARD_STRIP.sub('', str1).split())
    b = set(_JACCARD_STRIP.sub('', str2).split())

    if not a and not b:
        # two word-less texts are identical; also avoids a division by zero
        return 1.0

    c = a & b

    return float(len(c)) / (len(a) + len(b) - len(c))

def jaccard_process(series, mode=''):
    '''Compute pairwise Jaccard similarities between all texts in `series`.

    Parameters
    ----------
    series : sequence of str
        A list, array or pandas Series of texts. It is copied into a list
        first, so elements are always addressed by position, whatever index
        a Series carries.
    mode : str, optional
        ''    (default) 1-D array of the similarities above the diagonal,
              i.e. every unordered pair once, without self-comparisons.
        'det' prints a legend line and returns the tuple
              (matrix, upper-triangle indices, values above the diagonal).
        'mat' returns the matrix rounded to 3 decimals as a pandas Styler
              with a 'Greens' background gradient.

    Only the upper triangle (including the diagonal) of the matrix is
    filled; the lower triangle stays 0.
    '''
    texts = list(series)
    n = len(texts)
    matrix = np.zeros([n, n])  # float64, so no later cast is needed

    for i in range(n):
        # similarity is symmetric, so only j >= i is computed
        for j in range(i, n):
            matrix[i, j] = get_jaccard_sim(texts[i], texts[j])

    upper = np.triu_indices(n, k=1)  # k=1 skips the diagonal
    if mode == 'det':
        print('raw matrix, np.triu mask, masked half triangle without diagonals')
        return matrix, upper, matrix[upper]
    elif mode == 'mat':
        return pd.DataFrame(matrix).round(3).style.background_gradient('Greens')
    else:
        return matrix[upper]

automated together

In [4]:
# Number of texts loaded per file.
for ky, texts in text_dict.items():
    print(ky, len(texts))

backpacks_uk_autotext 892
backpacks_uk_human_written_texts 143


In [5]:
# Arrange the keys of text_dict, in their stored order, into rows of two.
# NOTE(review): presumably each row is one autotext / human-written pair - confirm.
iter_list = np.array(list(text_dict)).reshape(len(text_dict) // 2, 2)
iter_list

array([['backpacks_uk_autotext', 'backpacks_uk_human_written_texts']],
      dtype='<U32')

looping through file list
individual combinations

In [6]:
# Show every row of keys that will be compared together.
for key_pair in iter_list:
    print(key_pair)

['backpacks_uk_autotext' 'backpacks_uk_human_written_texts']


Full data set (no sampling); histograms are normalised with density=True

In [None]:
# seed and size are only referenced by the commented-out sampling line below.
seed = 4243
size = 'full'
# rows of Series.describe() that are printed for each file
fields = ['count','mean','50%','std']

for i in iter_list:

    file_list = i
    # NOTE(review): min_length is not used in this cell; presumably it is
    # meant as the sample size for the sampling variant - confirm before removing.
    min_length = min([len(text_dict[key]) for key in i])

    print('jaccard similarity, full set')
    # one figure per row of iter_list; its histograms share the same axes
    fig, ax = plt.subplots(figsize=[15, 10])
    for key in file_list:
        sent = text_dict[key].astype('str').values
        #sent = text_dict[key].sample(size, random_state=seed).astype('str').values
        jaccard = pd.Series(jaccard_process(sent))
        print(key,'\n', jaccard.describe()[fields])
        # label each histogram itself so the legend cannot get out of step
        jaccard.hist(bins=np.linspace(0,1,101), alpha=0.5, density=True, ax=ax, label=key)
    ax.set_xlabel('pairwise Jaccard similarity')
    ax.set_ylabel('density')
    ax.set_title(' vs '.join(file_list))
    ax.legend()
    plt.show()