### Instructions
Please run the cells in this notebook sequentially, from top to bottom, to avoid errors. The second cell downloads the NLTK data packages (tokenizer, part-of-speech tagger, named-entity chunker and word list) that the subsequent cells need in order to run correctly.

In [34]:
import pandas as pd
# Read the sampled review data from disk.
DATA_PATH = '../data_gpt_labeler/final_data_sampled.csv'
df = pd.read_csv(DATA_PATH)

# Draw a small, reproducible subset (fixed random_state) and copy it so that
# the feature columns added later are written to an independent frame.
df_sample = df.sample(n=5, random_state=42).copy()
df_sample.head()

Unnamed: 0.1,Unnamed: 0,rating,text,business_name,business_category,business_description,_id
6252,617643,5,Good fast Korean food. You can get an extra si...,Sam's Delicatessen,['Restaurant'],Casual counter-serve joint offering a mix of K...,1.0007936986998396e+20_1488457039508
4684,799373,4,Located in the small ranching town of Waimea o...,Merriman's Big Island,"['American restaurant', 'Bar', 'Restaurant']","Inventive, locally sourced menu of regional di...",1.1442347662180223e+20_1592251909308
1731,188913,3,It's all the way on the West Coast,Ewa Pointe Marketplace,['Plaza'],Outdoor shopping center with a variety of reta...,1.0990275060241975e+20_1507431452922
4742,763245,4,Great selection of Tequilas and Jo the bartner...,Mi Almita Cantina,"['Mexican restaurant', 'Restaurant']",Mexican cantina-inspired fare by a James Beard...,1.0158679044462235e+20_1584744923509
4521,399817,4,Lots of stunning views. Long hike left at 8:3...,Mauna Loa,['Volcano'],"Home to Mauna Loa Observatory, the world's lar...",1.1202399081333984e+20_1559076728343


In [36]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Fetch the NLTK resources used by word_tokenize, pos_tag and ne_chunk below.
# NLTK 3.9+ loads the tokenizer and the NE chunker from the "*_tab" packages
# (it already requires the "_eng" tagger requested here), while older releases
# use the original packages, so both variants are requested.
# nltk.download() reports a failed fetch by returning False rather than raising.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('maxent_ne_chunker')
nltk.download('maxent_ne_chunker_tab')
nltk.download('words')

[nltk_data] Downloading package punkt to /Users/yumin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/yumin/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/yumin/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /Users/yumin/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

### Training Phase
Fit the TfidfVectorizer on the entire corpus of reviews. In a real application, you would save this fitted vectorizer to a file.

In [38]:
# Missing reviews are replaced with empty strings before fitting.
review_corpus = df['text'].fillna('')

# Learn the vocabulary and IDF weights from the full corpus (English stop
# words removed); the fitted object is reused for per-review scoring later.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_vectorizer.fit(review_corpus)

### Inference Phase
Now we can compute the per-review features one review at a time. The TF-IDF score reuses the pre-fitted vectorizer, so single reviews can be scored without refitting.

In [40]:
def lexical_richness(text):
    """Return the type/token ratio of ``text``.

    The text is split with ``word_tokenize``; the result is the number of
    distinct tokens divided by the total number of tokens. Non-string input
    and text that yields no tokens both score 0.
    """
    if not isinstance(text, str):
        return 0

    words = word_tokenize(text)
    total = len(words)
    if total == 0:
        return 0

    distinct = len(set(words))
    return distinct / total

# Score every sampled review and preview the new column.
df_sample['lexical_richness'] = df_sample['text'].map(lexical_richness)
df_sample.head()

Unnamed: 0.1,Unnamed: 0,rating,text,business_name,business_category,business_description,_id,lexical_richness
6252,617643,5,Good fast Korean food. You can get an extra si...,Sam's Delicatessen,['Restaurant'],Casual counter-serve joint offering a mix of K...,1.0007936986998396e+20_1488457039508,0.952381
4684,799373,4,Located in the small ranching town of Waimea o...,Merriman's Big Island,"['American restaurant', 'Bar', 'Restaurant']","Inventive, locally sourced menu of regional di...",1.1442347662180223e+20_1592251909308,0.717241
1731,188913,3,It's all the way on the West Coast,Ewa Pointe Marketplace,['Plaza'],Outdoor shopping center with a variety of reta...,1.0990275060241975e+20_1507431452922,0.888889
4742,763245,4,Great selection of Tequilas and Jo the bartner...,Mi Almita Cantina,"['Mexican restaurant', 'Restaurant']",Mexican cantina-inspired fare by a James Beard...,1.0158679044462235e+20_1584744923509,1.0
4521,399817,4,Lots of stunning views. Long hike left at 8:3...,Mauna Loa,['Volcano'],"Home to Mauna Loa Observatory, the world's lar...",1.1202399081333984e+20_1559076728343,0.84127


In [42]:
def named_entity_density(text):
    """Return the number of named-entity chunks per token in ``text``.

    The text is tokenized, POS-tagged and passed through ``ne_chunk``; every
    top-level node of the result that exposes a ``label`` attribute counts as
    one named entity. Non-string input and text without tokens both score 0.
    """
    if not isinstance(text, str):
        return 0

    words = word_tokenize(text)
    if len(words) == 0:
        return 0

    tree = ne_chunk(pos_tag(words))
    # Plain (word, tag) tuples have no `label`; only chunk subtrees do.
    entity_count = sum(1 for node in tree if hasattr(node, 'label'))
    return entity_count / len(words)

# Compute the entity density per review, then attach it as a new column.
entity_density = df_sample['text'].apply(named_entity_density)
df_sample['named_entity_density'] = entity_density
df_sample.head()

Unnamed: 0.1,Unnamed: 0,rating,text,business_name,business_category,business_description,_id,lexical_richness,named_entity_density
6252,617643,5,Good fast Korean food. You can get an extra si...,Sam's Delicatessen,['Restaurant'],Casual counter-serve joint offering a mix of K...,1.0007936986998396e+20_1488457039508,0.952381,0.095238
4684,799373,4,Located in the small ranching town of Waimea o...,Merriman's Big Island,"['American restaurant', 'Bar', 'Restaurant']","Inventive, locally sourced menu of regional di...",1.1442347662180223e+20_1592251909308,0.717241,0.048276
1731,188913,3,It's all the way on the West Coast,Ewa Pointe Marketplace,['Plaza'],Outdoor shopping center with a variety of reta...,1.0990275060241975e+20_1507431452922,0.888889,0.111111
4742,763245,4,Great selection of Tequilas and Jo the bartner...,Mi Almita Cantina,"['Mexican restaurant', 'Restaurant']",Mexican cantina-inspired fare by a James Beard...,1.0158679044462235e+20_1584744923509,1.0,0.181818
4521,399817,4,Lots of stunning views. Long hike left at 8:3...,Mauna Loa,['Volcano'],"Home to Mauna Loa Observatory, the world's lar...",1.1202399081333984e+20_1559076728343,0.84127,0.015873


In [44]:
def get_average_tfidf_single(text, vectorizer):
    """Return the mean TF-IDF weight of one review over the whole vocabulary.

    Parameters
    ----------
    text : object
        Review text. Anything that is not a ``str`` (e.g. a missing value)
        scores 0.
    vectorizer : fitted vectorizer
        Object whose ``transform`` accepts a list of strings and returns a
        sparse matrix with one column per vocabulary term.

    Returns
    -------
    float
        Sum of the review's TF-IDF weights divided by the vocabulary size
        (terms absent from the review contribute 0 to the average).
    """
    if not isinstance(text, str):
        return 0.0

    # `transform` yields a one-row sparse matrix. Averaging it directly gives
    # the same value as densifying it into a vocabulary-sized DataFrame, without
    # allocating that frame (and the feature-name index) for every review.
    tfidf_row = vectorizer.transform([text])
    return tfidf_row.mean()

# Score each review against the vectorizer fitted on the full corpus; the
# vectorizer is forwarded to the scoring function as its second positional argument.
df_sample['tfidf_score'] = df_sample['text'].apply(
    get_average_tfidf_single, args=(tfidf_vectorizer,)
)
df_sample.head()

Unnamed: 0.1,Unnamed: 0,rating,text,business_name,business_category,business_description,_id,lexical_richness,named_entity_density,tfidf_score
6252,617643,5,Good fast Korean food. You can get an extra si...,Sam's Delicatessen,['Restaurant'],Casual counter-serve joint offering a mix of K...,1.0007936986998396e+20_1488457039508,0.952381,0.095238,0.000252
4684,799373,4,Located in the small ranching town of Waimea o...,Merriman's Big Island,"['American restaurant', 'Bar', 'Restaurant']","Inventive, locally sourced menu of regional di...",1.1442347662180223e+20_1592251909308,0.717241,0.048276,0.000592
1731,188913,3,It's all the way on the West Coast,Ewa Pointe Marketplace,['Plaza'],Outdoor shopping center with a variety of reta...,1.0990275060241975e+20_1507431452922,0.888889,0.111111,0.000155
4742,763245,4,Great selection of Tequilas and Jo the bartner...,Mi Almita Cantina,"['Mexican restaurant', 'Restaurant']",Mexican cantina-inspired fare by a James Beard...,1.0158679044462235e+20_1584744923509,1.0,0.181818,0.000196
4521,399817,4,Lots of stunning views. Long hike left at 8:3...,Mauna Loa,['Volcano'],"Home to Mauna Loa Observatory, the world's lar...",1.1202399081333984e+20_1559076728343,0.84127,0.015873,0.00047


In [46]:
def review_length(text):
    """Return the number of characters in ``text``, or 0 for non-string input."""
    return len(text) if isinstance(text, str) else 0

# Character count of every sampled review.
df_sample['review_length'] = df_sample['text'].map(review_length)
df_sample.head()

Unnamed: 0.1,Unnamed: 0,rating,text,business_name,business_category,business_description,_id,lexical_richness,named_entity_density,tfidf_score,review_length
6252,617643,5,Good fast Korean food. You can get an extra si...,Sam's Delicatessen,['Restaurant'],Casual counter-serve joint offering a mix of K...,1.0007936986998396e+20_1488457039508,0.952381,0.095238,0.000252,96
4684,799373,4,Located in the small ranching town of Waimea o...,Merriman's Big Island,"['American restaurant', 'Bar', 'Restaurant']","Inventive, locally sourced menu of regional di...",1.1442347662180223e+20_1592251909308,0.717241,0.048276,0.000592,720
1731,188913,3,It's all the way on the West Coast,Ewa Pointe Marketplace,['Plaza'],Outdoor shopping center with a variety of reta...,1.0990275060241975e+20_1507431452922,0.888889,0.111111,0.000155,34
4742,763245,4,Great selection of Tequilas and Jo the bartner...,Mi Almita Cantina,"['Mexican restaurant', 'Restaurant']",Mexican cantina-inspired fare by a James Beard...,1.0158679044462235e+20_1584744923509,1.0,0.181818,0.000196,56
4521,399817,4,Lots of stunning views. Long hike left at 8:3...,Mauna Loa,['Volcano'],"Home to Mauna Loa Observatory, the world's lar...",1.1202399081333984e+20_1559076728343,0.84127,0.015873,0.00047,310


In [50]:
import math

def sigmoid(x):
    """Logistic function: maps ``x`` to ``1 / (1 + e**-x)``, a value in (0, 1)."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def log_scale(x, max_value=2000):
    """Scale a non-negative value logarithmically relative to ``max_value``.

    Parameters
    ----------
    x : int or float
        Value to scale (e.g. a review length in characters). Must be greater
        than -1, otherwise ``math.log`` raises ``ValueError``.
    max_value : int or float, optional
        Value that maps to exactly 1.0. Defaults to 2000, the anchor that was
        previously hard-coded.

    Returns
    -------
    float
        ``log(1 + x) / log(1 + max_value)``: 0.0 for ``x == 0``, 1.0 for
        ``x == max_value``, and above 1.0 for larger ``x`` (no clipping).
    """
    return math.log(1 + x) / math.log(1 + max_value)

# Raw feature -> normalised column, for the features squashed with the sigmoid.
# NOTE(review): these raw features are ratios/averages that already lie in
# [0, 1], so the sigmoid maps them into roughly [0.5, 0.73] (tfidf_score ends
# up at ~0.5 for every row) -- confirm this compression is intended.
sigmoid_columns = {
    'lexical_richness': 'lexical_richness_norm',
    'named_entity_density': 'named_entity_density_norm',
    'tfidf_score': 'tfidf_score_norm',
}
for raw_column, norm_column in sigmoid_columns.items():
    df_sample[norm_column] = df_sample[raw_column].apply(sigmoid)

# Review length is scaled logarithmically instead.
df_sample['review_length_norm'] = df_sample['review_length'].apply(log_scale)

df_sample.head()

Unnamed: 0.1,Unnamed: 0,rating,text,business_name,business_category,business_description,_id,lexical_richness,named_entity_density,tfidf_score,review_length,lexical_richness_norm,named_entity_density_norm,tfidf_score_norm,review_length_norm
6252,617643,5,Good fast Korean food. You can get an extra si...,Sam's Delicatessen,['Restaurant'],Casual counter-serve joint offering a mix of K...,1.0007936986998396e+20_1488457039508,0.952381,0.095238,0.000252,96,0.721594,0.523792,0.500063,0.601825
4684,799373,4,Located in the small ranching town of Waimea o...,Merriman's Big Island,"['American restaurant', 'Bar', 'Restaurant']","Inventive, locally sourced menu of regional di...",1.1442347662180223e+20_1592251909308,0.717241,0.048276,0.000592,720,0.671999,0.512067,0.500148,0.865714
1731,188913,3,It's all the way on the West Coast,Ewa Pointe Marketplace,['Plaza'],Outdoor shopping center with a variety of reta...,1.0990275060241975e+20_1507431452922,0.888889,0.111111,0.000155,34,0.708661,0.527749,0.500039,0.467723
4742,763245,4,Great selection of Tequilas and Jo the bartner...,Mi Almita Cantina,"['Mexican restaurant', 'Restaurant']",Mexican cantina-inspired fare by a James Beard...,1.0158679044462235e+20_1584744923509,1.0,0.181818,0.000196,56,0.731059,0.54533,0.500049,0.531882
4521,399817,4,Lots of stunning views. Long hike left at 8:3...,Mauna Loa,['Volcano'],"Home to Mauna Loa Observatory, the world's lar...",1.1202399081333984e+20_1559076728343,0.84127,0.015873,0.00047,310,0.698733,0.503968,0.500118,0.755097


In [52]:
# Relative contribution of each normalised feature; the weights sum to 1.
weights = {
    'lexical_richness_norm': 0.2,
    'named_entity_density_norm': 0.4,
    'tfidf_score_norm': 0.2,
    'review_length_norm': 0.2
}

# Weighted sum of the normalised columns, accumulated in the order the
# features are listed in `weights`.
df_sample['specificity_score'] = sum(
    df_sample[column] * weight for column, weight in weights.items()
)

df_sample.head()

Unnamed: 0.1,Unnamed: 0,rating,text,business_name,business_category,business_description,_id,lexical_richness,named_entity_density,tfidf_score,review_length,lexical_richness_norm,named_entity_density_norm,tfidf_score_norm,review_length_norm,specificity_score
6252,617643,5,Good fast Korean food. You can get an extra si...,Sam's Delicatessen,['Restaurant'],Casual counter-serve joint offering a mix of K...,1.0007936986998396e+20_1488457039508,0.952381,0.095238,0.000252,96,0.721594,0.523792,0.500063,0.601825,0.574213
4684,799373,4,Located in the small ranching town of Waimea o...,Merriman's Big Island,"['American restaurant', 'Bar', 'Restaurant']","Inventive, locally sourced menu of regional di...",1.1442347662180223e+20_1592251909308,0.717241,0.048276,0.000592,720,0.671999,0.512067,0.500148,0.865714,0.612399
1731,188913,3,It's all the way on the West Coast,Ewa Pointe Marketplace,['Plaza'],Outdoor shopping center with a variety of reta...,1.0990275060241975e+20_1507431452922,0.888889,0.111111,0.000155,34,0.708661,0.527749,0.500039,0.467723,0.546384
4742,763245,4,Great selection of Tequilas and Jo the bartner...,Mi Almita Cantina,"['Mexican restaurant', 'Restaurant']",Mexican cantina-inspired fare by a James Beard...,1.0158679044462235e+20_1584744923509,1.0,0.181818,0.000196,56,0.731059,0.54533,0.500049,0.531882,0.57073
4521,399817,4,Lots of stunning views. Long hike left at 8:3...,Mauna Loa,['Volcano'],"Home to Mauna Loa Observatory, the world's lar...",1.1202399081333984e+20_1559076728343,0.84127,0.015873,0.00047,310,0.698733,0.503968,0.500118,0.755097,0.592377


In [150]:
# Show the full text of one sampled review, looked up by its index label.
df_sample.loc[4521, "text"]

'Lots of stunning views.  Long hike left at 8:30am and still did not make it back to the car until 9pm.  Luckily we had brought flashlights so we were okay. Despite drinking a lot of fluids (2 gallons) still got altitude sickness anyway.  Had a great day, reached the summit and have many fantastic photographs.'