In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load only the two free-text columns from the tab-separated listings file.
data = pd.read_csv(
    'data/dataset_rent_rome_kijiji.tsv',
    sep='\t',
    header=0,
    usecols=['Title', 'Short Description'],
)
data.head()

Unnamed: 0,Title,Short Description
0,Studio accessoriato vicino metro A Furio Camillo,Affitto studio a professionisti preferibilment...
1,"Negozio 169Mq per laboratorio, ufficio, studio...","Privato affitta negozio 169 mq, al piano terra..."
2,Negozio in tiburtina centro,Negozio c/1 roma tiburtina centro via eugenio ...
3,Studio medico via anapo parco nemorense,"Studio medico avviato, composto da tre studi c..."
4,Cerco: Appartamento per donna lavoratrice refe...,"Donna lavoratrice, non residente, con reddito ..."


In [3]:
class KShingles:
    """Callable that splits a sequence into its set of k-shingles.

    A k-shingle is any run of ``k`` consecutive items (characters, for the
    strings used in this notebook) of the input.
    """

    def __init__(self, k):
        """Store the shingle length.

        Args:
            k: Number of characters per shingle; must be at least 1.

        Raises:
            ValueError: If ``k`` is smaller than 1.
        """
        # k == 0 would yield the single empty shingle, and a negative k yields
        # slices of mixed lengths, so both are rejected up front.
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = k

    def __call__(self, entry):
        """Return the set of distinct k-shingles of ``entry``.

        Args:
            entry: Sliceable sequence to shingle (a string in this notebook).

        Returns:
            A set holding every length-``k`` slice of ``entry``; the set is
            empty when ``entry`` is shorter than ``k``.
        """
        return {entry[i:i + self.k] for i in range(len(entry) - self.k + 1)}


# Quick check: the distinct 3-shingles of 'paper'.
KShingles(3)('paper')

{'ape', 'pap', 'per'}

In [4]:
# Shingle length used for the title documents.
k = 5
k_shingles = KShingles(k)

# Lower-case every cell first, then turn each title into its set of
# k-shingles (one set per row).
data = data.map(str.lower)
data['Title Document'] = data['Title'].map(k_shingles)

data.head()

Unnamed: 0,Title,Short Description,Title Document
0,studio accessoriato vicino metro a furio camillo,affitto studio a professionisti preferibilment...,"{oriat, acces, vicin, io ca, camil, o cam, to ..."
1,"negozio 169mq per laboratorio, ufficio, studio...","privato affitta negozio 169 mq, al piano terra...","{ a ro, borat, oma. , per l, 169mq, q per, ato..."
2,negozio in tiburtina centro,negozio c/1 roma tiburtina centro via eugenio ...,"{ in t, io in, burti, urtin, centr, na ce, n t..."
3,studio medico via anapo parco nemorense,"studio medico avviato, composto da tre studi c...","{ico v, co ne, parco, po pa, o nem, moren, a a..."
4,cerco: appartamento per donna lavoratrice refe...,"donna lavoratrice, non residente, con reddito ...","{rco: , nziat, lavo, e ref, erenz, ament, r d..."


In [5]:
from itertools import chain

# Flatten the per-title shingle sets into one list (repeats across titles kept).
all_shingles = list(chain(*data['Title Document']))
# Sorting the distinct shingles gives each one a fixed position.
unique_shingles = sorted(set(all_shingles))
num_shingles = len(unique_shingles)

num_shingles

12840

In [6]:
# Shingle -> position in `unique_shingles`, precomputed for fast lookup.
shingle_index_map = dict(zip(unique_shingles, range(len(unique_shingles))))


def shingle_vector(shingle_set):
    """Encode a collection of shingles as a boolean membership vector.

    The result has one slot per entry of ``unique_shingles``; the slot at the
    position ``shingle_index_map`` gives for a shingle is True when that
    shingle is in ``shingle_set``. A shingle missing from
    ``shingle_index_map`` raises ``KeyError``.
    """
    positions = [shingle_index_map[shingle] for shingle in shingle_set]

    membership = np.zeros(len(unique_shingles), dtype=bool)
    # Explicit integer dtype keeps the index valid when `positions` is empty.
    membership[np.asarray(positions, dtype=np.intp)] = True
    return membership


# One boolean membership vector per title.
data['Title Document Vectors'] = data['Title Document'].apply(shingle_vector)

# Sanity check on the first row: the vector has as many True entries as the
# title has distinct shingles.
assert len(data['Title Document'][0]) == sum(data['Title Document Vectors'][0])
data.head()

Unnamed: 0,Title,Short Description,Title Document,Title Document Vectors
0,studio accessoriato vicino metro a furio camillo,affitto studio a professionisti preferibilment...,"{oriat, acces, vicin, io ca, camil, o cam, to ...","[False, False, False, False, False, False, Fal..."
1,"negozio 169mq per laboratorio, ufficio, studio...","privato affitta negozio 169 mq, al piano terra...","{ a ro, borat, oma. , per l, 169mq, q per, ato...","[False, False, False, False, False, False, Fal..."
2,negozio in tiburtina centro,negozio c/1 roma tiburtina centro via eugenio ...,"{ in t, io in, burti, urtin, centr, na ce, n t...","[False, False, False, False, False, False, Fal..."
3,studio medico via anapo parco nemorense,"studio medico avviato, composto da tre studi c...","{ico v, co ne, parco, po pa, o nem, moren, a a...","[False, False, False, False, False, False, Fal..."
4,cerco: appartamento per donna lavoratrice refe...,"donna lavoratrice, non residente, con reddito ...","{rco: , nziat, lavo, e ref, erenz, ament, r d...","[False, False, False, False, False, False, Fal..."


In [7]:
def is_prime(n):
    """Return True when ``n`` is a prime number.

    Args:
        n: Integer to test.

    Returns:
        False for every ``n`` below 2; otherwise True exactly when no integer
        in ``[2, int(sqrt(n))]`` divides ``n``.
    """
    # 0 and 1 have no candidate divisors, so the trial-division test below
    # would vacuously report them as prime; negative numbers are never prime
    # (and ``n ** 0.5`` would be complex for them).
    if n < 2:
        return False
    return all(n % i for i in range(2, int(n ** 0.5) + 1))


def next_prime(n):
    """Return the smallest value >= ``n`` that ``is_prime`` accepts."""
    candidate = n
    while True:
        if is_prime(candidate):
            return candidate
        candidate += 1


# Smallest prime at or above the number of distinct shingles.
next_prime(num_shingles)

12841

In [8]:
class HashFunction:
    """Callable computing ``((a * x + b) % p) % n`` for its stored parameters."""

    def __init__(self, a, b, p, n):
        """Store the multiplier ``a``, offset ``b`` and the moduli ``p`` and ``n``."""
        self.a = a
        self.b = b
        self.p = p
        self.n = n

    def __call__(self, x):
        """Return the hash of ``x``."""
        linear = self.a * x + self.b
        return linear % self.p % self.n


H = 100  # number of hash functions
p = next_prime(num_shingles)  # prime modulus handed to every HashFunction
SEED = 42

# Seeded generator: without a fixed seed every kernel restart draws different
# coefficients, so the hashes and signatures below could not be reproduced.
rng = np.random.default_rng(SEED)
all_a = rng.integers(1, num_shingles, H)  # multipliers in [1, num_shingles)
all_b = rng.integers(0, num_shingles, H)  # offsets in [0, num_shingles)

hs = np.array([
    # A plain `lambda x: (a * x + b) % p % num_shingles` does not work here:
    # closures bind `a` and `b` late, so every lambda would end up using the
    # last pair. Each HashFunction instance stores its own coefficients.
    HashFunction(a, b, p, num_shingles)
    for a, b in zip(all_a, all_b)
])

[h(0) for h in hs]

[7370,
 3011,
 1861,
 9926,
 9544,
 8663,
 80,
 7032,
 5956,
 8136,
 2132,
 8264,
 5826,
 7264,
 5663,
 11601,
 4467,
 5046,
 6495,
 10948,
 4486,
 5394,
 9641,
 9092,
 3039,
 7674,
 367,
 5677,
 10940,
 2543,
 1628,
 2875,
 4077,
 10656,
 5602,
 679,
 3172,
 6545,
 2132,
 8251,
 12140,
 805,
 11878,
 3619,
 5852,
 12206,
 2260,
 1831,
 11837,
 4649,
 8678,
 10256,
 5438,
 11018,
 2883,
 303,
 5230,
 10469,
 10398,
 5207,
 2494,
 12665,
 2429,
 5977,
 7629,
 4994,
 1399,
 9068,
 8070,
 2381,
 10092,
 5332,
 4681,
 3151,
 11379,
 3104,
 7344,
 11302,
 2086,
 5429,
 5525,
 5093,
 9497,
 4740,
 10155,
 4466,
 6516,
 11969,
 9002,
 10388,
 7890,
 10033,
 528,
 131,
 3763,
 70,
 11575,
 12370,
 6885,
 1554]

In [9]:
def minhash_vector(shingle_vector, hash_functions=None):
    """Compute the MinHash signature of one boolean shingle vector.

    Args:
        shingle_vector: Boolean sequence; position ``i`` is truthy when the
            shingle with index ``i`` occurs in the document.
        hash_functions: Optional sequence of callables, each mapping a shingle
            index to a hash value. Defaults to the notebook-level ``hs``.

    Returns:
        Object-dtype array with one entry per hash function: the smallest hash
        value over the indices set in ``shingle_vector``, or ``inf`` when no
        position is set.
    """
    if hash_functions is None:
        hash_functions = hs

    minhashes = np.full(len(hash_functions), np.inf, dtype=object)

    # Visit only the positions that are set, in ascending order, instead of
    # stepping through every slot of the vector in a Python loop.
    for shingle_i in np.flatnonzero(shingle_vector):
        shingle_i = int(shingle_i)  # hand plain ints to the hash functions
        for i, h in enumerate(hash_functions):
            minhashes[i] = min(minhashes[i], h(shingle_i))

    return minhashes


# MinHash signature per title, computed from its boolean shingle vector.
data['Title MinHash Signature'] = data['Title Document Vectors'].apply(minhash_vector)

data.head()

Unnamed: 0,Title,Short Description,Title Document,Title Document Vectors,Title MinHash Signature
0,studio accessoriato vicino metro a furio camillo,affitto studio a professionisti preferibilment...,"{oriat, acces, vicin, io ca, camil, o cam, to ...","[False, False, False, False, False, False, Fal...","[224, 96, 30, 114, 241, 666, 565, 67, 283, 344..."
1,"negozio 169mq per laboratorio, ufficio, studio...","privato affitta negozio 169 mq, al piano terra...","{ a ro, borat, oma. , per l, 169mq, q per, ato...","[False, False, False, False, False, False, Fal...","[164, 291, 357, 98, 241, 224, 113, 142, 25, 43..."
2,negozio in tiburtina centro,negozio c/1 roma tiburtina centro via eugenio ...,"{ in t, io in, burti, urtin, centr, na ce, n t...","[False, False, False, False, False, False, Fal...","[846, 95, 31, 26, 109, 369, 355, 640, 44, 278,..."
3,studio medico via anapo parco nemorense,"studio medico avviato, composto da tre studi c...","{ico v, co ne, parco, po pa, o nem, moren, a a...","[False, False, False, False, False, False, Fal...","[419, 198, 822, 144, 19, 436, 126, 147, 276, 1..."
4,cerco: appartamento per donna lavoratrice refe...,"donna lavoratrice, non residente, con reddito ...","{rco: , nziat, lavo, e ref, erenz, ament, r d...","[False, False, False, False, False, False, Fal...","[164, 40, 127, 102, 78, 1, 357, 336, 329, 129,..."


With the code components provided, we can now modularize our work into reusable utility functions. This approach makes it straightforward to integrate these functions into other projects, such as the ongoing MinHash exercise. Specifically, we have:

- `KShingles`: A class to generate k-shingles from text entries.
- `HashFunction`: A class that defines and applies a simple hash function.
- `create_shingles_index`: A function to create an index mapping from unique shingles.
- `shingle_vector`: A function to convert a set of shingles into a vector of boolean values.
- `create_hashfunctions`: A function to create a list of hash functions for MinHash.
- `minhash_vector`: A function to compute the MinHash signature vector for a document.
- `MinHash`: A class that uses all of the above functions to return a MinHash of entries.

These utilities allow us to seamlessly process text data into k-shingles, create vector representations, and compute MinHash signatures. While we are not implementing Locality-Sensitive Hashing (LSH) to obtain candidate pairs yet, these foundational tools are essential for building towards that capability in future stages.

Below you will find the code using the util functions.

In [10]:
# Explicit import: `from util import *` can silently rebind names this
# notebook already defines (e.g. KShingles, HashFunction, minhash_vector).
from util import MinHash
import pandas as pd

# Reload and lower-case the listings so this cell runs on its own.
data = pd.read_csv('data/dataset_rent_rome_kijiji.tsv', sep='\t', header=0,
                   usecols=['Title', 'Short Description'])
data = data.map(str.lower)
min_hash = MinHash(k=4)
signature = min_hash(data['Title'])

signature.head()

0    [198.0, 134.0, 102.0, 355.0, 80.0, 456.0, 5.0,...
1    [5.0, 333.0, 330.0, 155.0, 111.0, 86.0, 23.0, ...
2    [94.0, 29.0, 285.0, 71.0, 58.0, 190.0, 21.0, 8...
3    [36.0, 390.0, 409.0, 88.0, 155.0, 471.0, 31.0,...
4    [79.0, 124.0, 3.0, 41.0, 121.0, 1.0, 27.0, 48....
Name: Title, dtype: object