In [2]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the Yelp <-> establishment category matching table (path is relative to the notebook).
df = pd.read_csv(filepath_or_buffer='data/finalMatchYelp.csv')

In [4]:
# Display the freshly loaded frame to inspect its columns and a few rows.
df

Unnamed: 0,YelpLeaf,YelpParent1,YelpParent2,YelpParent3,MatchLeaf,MatchParent1,MatchParent2,MatchParent3,Tradition,Self-Expression,...,Neighborliness,Formality,Exhibitionism,Glamour,Transgression,Rationality,Locality,State,Corporateness,Ethnicity
0,3D Printing,Local Services,-,-,"Digital Imaging, Printing & Photography",Computer Services,-,-,3.0,4.0,...,3.0,3.0,3.0,3.0,3.0,3.00,3.00,3.00,3.0,3.0
1,Abruzzese,Italian,Restaurants,-,Italian Restaurant,-,-,-,4.0,3.0,...,3.0,3.0,3.0,3.0,3.0,3.00,3.00,3.00,3.0,3.0
2,Absinthe Bars,Bars,Nightlife,-,Bars & Pubs,-,-,-,3.0,3.0,...,3.8,3.0,3.2,3.0,3.0,2.25,3.50,2.75,3.0,3.0
3,Acai Bowls,Food,-,-,Health Food Restaurant,-,-,-,2.8,3.2,...,3.0,3.0,3.0,3.0,3.0,4.00,3.25,3.00,3.0,3.0
4,Accessories,Fashion,Shopping,-,Accessories-Fashion,Accessories,-,-,3.0,4.0,...,3.0,3.0,3.0,4.0,3.0,3.00,3.00,3.00,3.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1538,Awnings,Local Services,-,-,Docks & Dock Builders,-,-,-,3.0,3.0,...,3.0,3.0,3.0,3.0,3.0,3.00,3.00,3.00,3.0,3.0
1539,Alternative Medicine,Health & Medical,-,-,Massages & Alternative Therapies,-,-,-,3.0,4.0,...,3.0,3.0,3.0,3.0,3.0,2.00,3.00,3.00,3.0,3.0
1540,Window Washing,Home Services,-,-,Contractors-General,-,-,-,3.0,3.0,...,3.0,3.0,3.0,3.0,3.0,3.00,3.00,3.00,3.0,3.0
1541,Ethnic Grocery,Food,-,-,Farmers Markets,-,-,-,4.0,3.5,...,4.0,3.0,3.0,3.0,3.0,3.00,4.50,3.00,2.0,3.0


In [5]:
def calculate_similarity_sentences(sentences_estab, sentences_yelp, model_name='all-MiniLM-L6-v2'):
    """
    Finds, for each establishment sentence, the most semantically similar Yelp sentence.

    Both collections are embedded with a Sentence Transformer model and compared through
    cosine similarity. For every establishment sentence, the Yelp sentence with the highest
    score is retrieved.

    Parameters
    ----------
    sentences_estab: pandas.core.series.Series
        The establishments sentences. Only the values are used, so the Series does not
        need a default RangeIndex (any iterable of strings works).
    sentences_yelp: pandas.core.series.Series
        The Yelp sentences. Only the values are used, as above.
    model_name: str, optional
        Name or path of the Sentence Transformer model to load.
        Defaults to 'all-MiniLM-L6-v2'.

    Raises
    ------
    No Raises.

    Returns
    -------
    pandas.core.frame.DataFrame
        One row per establishment sentence, in input order, with the columns
        'phrase_establishment', 'phrase_yelp' and 'score' (cosine similarity rounded
        to 4 decimal places). The frame is empty if either input is empty.
    """
    result_columns = ['phrase_establishment', 'phrase_yelp', 'score']

    # Work on plain lists: the positions coming out of the similarity matrix are then
    # valid whatever index the input Series carry (label lookups would not be).
    estab_list = list(sentences_estab)
    yelp_list = list(sentences_yelp)

    # Nothing to match: skip loading the model and return an empty, well-formed frame.
    if not estab_list or not yelp_list:
        return pd.DataFrame(columns=result_columns)

    model = SentenceTransformer(model_name)

    embeddings_estab = model.encode(estab_list, convert_to_tensor=True)
    embeddings_yelp = model.encode(yelp_list, convert_to_tensor=True)

    # Matrix of shape (len(estab_list), len(yelp_list)): one row per establishment sentence.
    cosine_scores = util.cos_sim(embeddings_estab, embeddings_yelp)

    # Row-wise maximum gives the best score and the position of the matching Yelp
    # sentence directly, without materialising and sorting every pair in Python.
    best_scores, best_positions = cosine_scores.max(dim=1)

    return pd.DataFrame({
        'phrase_establishment': estab_list,
        'phrase_yelp': [yelp_list[position] for position in best_positions.tolist()],
        'score': [round(float(value), 4) for value in best_scores.tolist()],
    })

In [6]:
# First experiment: compare leaf-level category labels only (establishment side vs. Yelp side).
sentences_estab, sentences_yelp = df['MatchLeaf'], df['YelpLeaf']

In [7]:
# Best Yelp leaf for every establishment leaf; the resulting frame is shown as the cell output.
calculate_similarity_sentences(
    sentences_estab=sentences_estab,
    sentences_yelp=sentences_yelp,
)

Unnamed: 0,phrase_establishment,phrase_yelp,score
0,"Digital Imaging, Printing & Photography",3D Printing,0.6138
1,Italian Restaurant,Italian,0.6708
2,Bars & Pubs,Pubs,0.8394
3,Health Food Restaurant,Fast Food,0.6161
4,Accessories-Fashion,Accessories,0.8068
...,...,...,...
1538,Docks & Dock Builders,Fuel Docks,0.6139
1539,Massages & Alternative Therapies,Massage Therapy,0.8384
1540,Contractors-General,Contractors,0.8872
1541,Farmers Markets,Farmers Market,0.9605


In [None]:
def build_phrase(frame, leaf_col, parent_cols, placeholder='-'):
    """
    Joins a leaf category with its parent categories into one space-separated phrase.

    Parameters
    ----------
    frame: pandas.core.frame.DataFrame
        The frame holding the category columns.
    leaf_col: str
        Name of the leaf category column.
    parent_cols: list of str
        Names of the parent category columns, in the order they are appended.
    placeholder: str, optional
        Cell value marking a missing parent level; such levels are left out.

    Returns
    -------
    pandas.core.series.Series
        One phrase per row, without leading, trailing or repeated whitespace.
    """
    # Whole-cell replacement: only parents that are exactly the placeholder are blanked.
    parents = frame[parent_cols].replace(placeholder, '')
    phrase = frame[leaf_col].str.cat([parents[col] for col in parent_cols], sep=' ')
    # Blanked levels leave stray separators behind; collapse and trim them.
    return phrase.str.replace(r'\s+', ' ', regex=True).str.strip()


df['MatchPhrase'] = build_phrase(df, 'MatchLeaf', ['MatchParent1', 'MatchParent2', 'MatchParent3'])
df['YelpPhrase'] = build_phrase(df, 'YelpLeaf', ['YelpParent1', 'YelpParent2', 'YelpParent3'])

In [9]:
# Inspect the establishment-side phrases (leaf followed by its parent categories).
df.loc[:, "MatchPhrase"]

0       Digital Imaging, Printing & Photography Comput...
1                                   Italian Restaurant   
2                                          Bars & Pubs   
3                               Health Food Restaurant   
4                       Accessories-Fashion Accessories  
                              ...                        
1538                             Docks & Dock Builders   
1539                  Massages & Alternative Therapies   
1540                               Contractors-General   
1541                                   Farmers Markets   
1542                                  Amusement Places   
Name: MatchPhrase, Length: 1543, dtype: object

In [10]:
# Inspect the Yelp-side phrases (leaf followed by its parent categories).
df.loc[:, "YelpPhrase"]

0                  3D Printing Local Services  
1                Abruzzese Italian Restaurants 
2                 Absinthe Bars Bars Nightlife 
3                             Acai Bowls Food  
4                 Accessories Fashion Shopping 
                         ...                   
1538                   Awnings Local Services  
1539    Alternative Medicine Health & Medical  
1540             Window Washing Home Services  
1541                      Ethnic Grocery Food  
1542                    Aquariums Active Life  
Name: YelpPhrase, Length: 1543, dtype: object

In [11]:
# Second experiment: match on the enriched phrases (leaf + parent categories) instead of leaves alone.
df_phrase_score = calculate_similarity_sentences(
    sentences_estab=df["MatchPhrase"],
    sentences_yelp=df["YelpPhrase"],
)

In [12]:
# Show the first 20 phrase-level matches.
df_phrase_score.iloc[:20]

Unnamed: 0,phrase_establishment,phrase_yelp,score
0,"Digital Imaging, Printing & Photography Comput...",Real Estate Photography Real Estate Services R...,0.5793
1,Italian Restaurant,Italian Restaurants,0.959
2,Bars & Pubs,Pubs Bars Nightlife,0.8947
3,Health Food Restaurant,Fast Food Restaurants,0.7442
4,Accessories-Fashion Accessories,Accessories Fashion Shopping,0.9201
5,Accountants Accounting Services,Accountants Professional Services,0.918
6,Skin Care-Products & Treatments Skin & scalp s...,Skin Care Beauty & Spas,0.6689
7,Acupuncturists Alternative Health Care,Acupuncture Health & Medical,0.7359
8,Addiction-Information & Treatment Centres Ther...,Addiction Medicine Doctors Health & Medical,0.5595
9,Adoption Services Children & Child Care,Child Care & Day Care Local Services,0.7242


In [13]:
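# NOTE: pasted text output kept for reference only; it appears to be the phrase-level
# matching result (df_phrase_score) from an earlier run, including the tail rows.
# 	phrase_establishment	phrase_yelp	score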
# 0	Digital Imaging, Printing & Photography Comput...	Real Estate Photography Real Estate Services R...	0.5793
# 1	Italian Restaurant	Italian Restaurants	0.9590
# 2	Bars & Pubs	Pubs Bars Nightlife	0.8947
# 3	Health Food Restaurant	Fast Food Restaurants	0.7442
# 4	Accessories-Fashion Accessories	Accessories Fashion Shopping	0.9201
# ...	...	...	...
# 1538	Docks & Dock Builders	Fuel Docks Automotive	0.5512
# 1539	Massages & Alternative Therapies	Massage Therapy Health & Medical	0.7953
# 1540	Contractors-General	Contractors Home Services	0.6703
# 1541	Farmers Markets	Farmers Market Food	0.8848
# 1542	Amusement Places	Amusement Parks Active Life	0.6497


In [14]:
# Display the frame again, now including the added MatchPhrase / YelpPhrase columns.
df

Unnamed: 0,YelpLeaf,YelpParent1,YelpParent2,YelpParent3,MatchLeaf,MatchParent1,MatchParent2,MatchParent3,Tradition,Self-Expression,...,Exhibitionism,Glamour,Transgression,Rationality,Locality,State,Corporateness,Ethnicity,MatchPhrase,YelpPhrase
0,3D Printing,Local Services,-,-,"Digital Imaging, Printing & Photography",Computer Services,-,-,3.0,4.0,...,3.0,3.0,3.0,3.00,3.00,3.00,3.0,3.0,"Digital Imaging, Printing & Photography Comput...",3D Printing Local Services
1,Abruzzese,Italian,Restaurants,-,Italian Restaurant,-,-,-,4.0,3.0,...,3.0,3.0,3.0,3.00,3.00,3.00,3.0,3.0,Italian Restaurant,Abruzzese Italian Restaurants
2,Absinthe Bars,Bars,Nightlife,-,Bars & Pubs,-,-,-,3.0,3.0,...,3.2,3.0,3.0,2.25,3.50,2.75,3.0,3.0,Bars & Pubs,Absinthe Bars Bars Nightlife
3,Acai Bowls,Food,-,-,Health Food Restaurant,-,-,-,2.8,3.2,...,3.0,3.0,3.0,4.00,3.25,3.00,3.0,3.0,Health Food Restaurant,Acai Bowls Food
4,Accessories,Fashion,Shopping,-,Accessories-Fashion,Accessories,-,-,3.0,4.0,...,3.0,4.0,3.0,3.00,3.00,3.00,3.0,3.0,Accessories-Fashion Accessories,Accessories Fashion Shopping
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1538,Awnings,Local Services,-,-,Docks & Dock Builders,-,-,-,3.0,3.0,...,3.0,3.0,3.0,3.00,3.00,3.00,3.0,3.0,Docks & Dock Builders,Awnings Local Services
1539,Alternative Medicine,Health & Medical,-,-,Massages & Alternative Therapies,-,-,-,3.0,4.0,...,3.0,3.0,3.0,2.00,3.00,3.00,3.0,3.0,Massages & Alternative Therapies,Alternative Medicine Health & Medical
1540,Window Washing,Home Services,-,-,Contractors-General,-,-,-,3.0,3.0,...,3.0,3.0,3.0,3.00,3.00,3.00,3.0,3.0,Contractors-General,Window Washing Home Services
1541,Ethnic Grocery,Food,-,-,Farmers Markets,-,-,-,4.0,3.5,...,3.0,3.0,3.0,3.00,4.50,3.00,2.0,3.0,Farmers Markets,Ethnic Grocery Food
