In [321]:
import pandas as pd
import numpy as np
from datetime import datetime
import os
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

In [323]:
#Get ETL_DATE for Incremental Ingestion (enabled if ETL_DATE set to 'CURRENT_DATE')
from dotenv import load_dotenv
from pathlib import Path
import os

dotenv_path = Path('db_credentials.env')
load_dotenv(dotenv_path=dotenv_path)

ETL_DATE = os.getenv('ETL_DATE')

In [325]:
# For Incremental Ingestion (enabled if ETL_DATE set to 'CURRENT_DATE')
if ETL_DATE == 'CURRENT_DATE':
    ETL_DATE = datetime.today().strftime('%Y%m%d')
else:
   ETL_DATE = '20250322'
    
print(ETL_DATE)

20250328


In [327]:
df = pd.read_csv(f"data/silver_{ETL_DATE}_Airline_Reviews_Sentiment.csv")
print(len(df))
df.head()

52


Unnamed: 0,RowId,Airline Name,Overall_Rating,Review_Title,Review Date,Verified,Review,Top Review Image Url,Aircraft,Type Of Traveller,...,Cabin Staff Service,Food & Beverages,Ground Service,Inflight Entertainment,Wifi & Connectivity,Value For Money,Recommended,Id,sentiment_label,sentiment_scores
0,0,Aer Lingus,2.0,“terrible customer service”,2025-03-26,False,Absolutely terrible customer service. Tried t...,,,Couple Leisure,...,3.0,3.0,1.0,,3.0,1,no,661172569262a3a1c15d87e9459a0fefc8664914266518...,negative,0.94876
1,1,Air France,10.0,"""excellent customer service""",2025-03-28,True,My son was travelling on Kids Solo as an un...,,,Family Leisure,...,,,5.0,,,5,yes,925fcfda402034e92b439424f9eb35bec83154e497e641...,neutral,0.701676
2,2,Air India,3.0,“it was not a pleasant experience”,2025-03-26,False,Recently travelled on Air India flight from T...,,,Family Leisure,...,3.0,1.0,4.0,1.0,1.0,1,no,c09a8175ab7eda10f0fddbbdc6227c44836d964901af2c...,negative,0.710657
3,3,Air India Express,1.0,"""not accountable for any damage""",2025-03-23,True,No proper communication and very casual app...,https://www.airlinequality.com/wp-content/uplo...,,Family Leisure,...,1.0,1.0,1.0,1.0,1.0,1,no,455e41f45426cc1ebd206caba0945f054613394cb208c5...,negative,0.902924
4,4,AirAsia X,2.0,"""flight is delay for 20 hours""",2025-03-24,True,"Worse experience, my flight suppose to depa...",,,Business,...,1.0,1.0,1.0,,,1,no,a13ee7a23ddc22c11deadf130dbe3b23384427e2fd5e01...,negative,0.841998


#### Load embeddings from previously generated .npz file

In [330]:
review_text_embeddings = None
review_categories_embeddings = None
EMBEDDING_BACKUP_FILE = f"data/{ETL_DATE}_Airline_Reviews_gte_small.npz"
if os.path.exists(EMBEDDING_BACKUP_FILE):
    # Load the .npz file
    airline_quality_embeddings = np.load(EMBEDDING_BACKUP_FILE)
    
    # List the variables stored in the file
    print(airline_quality_embeddings.files)
    
    # Access the individual arrays
    review_text_embeddings = airline_quality_embeddings['review_text_embeddings']
    review_categories_embeddings = airline_quality_embeddings['review_categories_embeddings']

    print("review_text_embeddings:", review_text_embeddings)
    print("review_categories_embeddings:", review_categories_embeddings)
    
    # Close the file after use
    airline_quality_embeddings.close()
else:
    airline_quality_embeddings = None
    
#airline_quality_embeddings

In [332]:
#review_text_embeddings[0:1]

#### Convert review text to embeddings using thenlper/gte-small

In [335]:
from sentence_transformers import SentenceTransformer

In [337]:
# To fix SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame.
df_embed = df[['Id', 'Review']].copy()

In [339]:
if review_text_embeddings is None:
    embedding_model = SentenceTransformer("thenlper/gte-small")
    review_text_embeddings = embedding_model.encode(df_embed['Review'])
else:
    review_text_embeddings = review_text_embeddings

review_text_embeddings[0:1]

array([[-6.98428554e-03, -1.25224106e-02,  5.70371859e-02,
        -2.61981282e-02, -2.54146010e-02, -1.28013790e-02,
         9.70249474e-02,  4.39044647e-02,  2.40684990e-02,
        -3.00901346e-02,  1.13103120e-02, -4.25093696e-02,
         4.07670476e-02,  1.47937676e-02, -2.02824385e-03,
         3.74056846e-02, -6.65258989e-03, -2.02855542e-02,
        -6.67541027e-02,  4.69416678e-02,  5.62669709e-02,
        -5.23261093e-02, -3.00242584e-02, -2.29025446e-02,
         1.29802851e-02,  5.47184274e-02, -8.13384578e-02,
        -1.87874231e-02, -6.72451556e-02, -1.59364313e-01,
        -3.04386783e-02, -2.27219500e-02,  4.10974100e-02,
        -3.19892168e-02,  4.45738509e-02, -8.45515635e-03,
        -1.22857271e-02,  3.27443033e-02,  9.37276811e-04,
         4.86437976e-02,  3.65212634e-02,  1.86644942e-02,
        -1.75149962e-02, -4.50793244e-02, -3.34730446e-02,
        -3.23594138e-02,  1.73729006e-02,  4.68895212e-02,
         7.84255117e-02, -8.00581723e-02,  2.79602241e-0

In [340]:
df_embed['review_gte_small_embeddings'] = review_text_embeddings.tolist()
print(len(df_embed))
df_embed.head()

52


Unnamed: 0,Id,Review,review_gte_small_embeddings
0,661172569262a3a1c15d87e9459a0fefc8664914266518...,Absolutely terrible customer service. Tried t...,"[-0.006984285544604063, -0.012522410601377487,..."
1,925fcfda402034e92b439424f9eb35bec83154e497e641...,My son was travelling on Kids Solo as an un...,"[-0.036641675978899, 0.022963620722293854, 0.0..."
2,c09a8175ab7eda10f0fddbbdc6227c44836d964901af2c...,Recently travelled on Air India flight from T...,"[0.0173887200653553, -0.020334232598543167, 0...."
3,455e41f45426cc1ebd206caba0945f054613394cb208c5...,No proper communication and very casual app...,"[-0.009622704237699509, 0.012122679501771927, ..."
4,a13ee7a23ddc22c11deadf130dbe3b23384427e2fd5e01...,"Worse experience, my flight suppose to depa...","[0.025981714949011803, -0.01533507276326418, 0..."


#### Convert review category/ies to embeddings using thenlper/gte-small

In [342]:
review_categories = ['Lost Luggage']
    #, 'Cabin Crew Service', 'Ground Crew Service', 
    #'Seat Comfort', 'In-flight Amenities', 'Safety', 'Cleanliness',
    #'On-time Performance', 'Value for money']

In [343]:
df_review_categories = pd.DataFrame(review_categories, columns=['review_category'])
print(len(df_review_categories))
df_review_categories.head()

1


Unnamed: 0,review_category
0,Lost Luggage


In [344]:
if review_categories_embeddings is None:
    embedding_model = SentenceTransformer("thenlper/gte-small")
    review_categories_embeddings = embedding_model.encode(df_review_categories['review_category'])
else:
    review_categories_embeddings = review_categories_embeddings
    
review_categories_embeddings[0:1]

array([[-1.45721631e-02, -1.47483693e-02,  6.21146150e-02,
        -3.49040627e-02,  2.87266634e-02,  2.60636304e-02,
         8.35142434e-02,  2.47845184e-02, -3.39080244e-02,
        -8.66788160e-03,  2.98559386e-02, -5.27598150e-02,
         7.11026862e-02,  5.20410687e-02,  1.13727460e-02,
         5.30625042e-03,  1.43594220e-02,  1.67635325e-02,
        -8.92897472e-02,  3.19607370e-02,  1.40563874e-02,
        -3.45494412e-02, -7.90021271e-02, -1.59687418e-02,
        -7.97322858e-03,  1.69071723e-02, -2.01837160e-02,
        -3.29448655e-02, -5.25507778e-02, -2.15803638e-01,
        -8.31609964e-03, -7.47866556e-02,  1.21304998e-03,
        -3.39180306e-02,  1.88246649e-02, -3.53139592e-03,
        -1.53122824e-02,  2.21802033e-02,  4.79664514e-03,
         2.83719748e-02,  4.51335721e-02,  2.14884859e-02,
         7.38404412e-03, -3.94356102e-02, -4.01152596e-02,
        -5.14237583e-02, -4.85760951e-03, -1.17152985e-02,
         9.37279314e-02, -8.49065036e-02,  1.74151380e-0

In [345]:
df_review_categories['review_category_gte_small_embeddings'] = review_categories_embeddings.tolist()
df_review_categories[['review_category', 'review_category_gte_small_embeddings']]

Unnamed: 0,review_category,review_category_gte_small_embeddings
0,Lost Luggage,"[-0.014572163112461567, -0.014748369343578815,..."


#### Double check review category embeddings mapping

In [347]:
embedding_model = SentenceTransformer("thenlper/gte-small")
embedding_model.encode(df_review_categories['review_category'][0]).tolist()

[-0.014572163112461567,
 -0.014748369343578815,
 0.062114614993333817,
 -0.03490406274795532,
 0.028726663440465927,
 0.02606363035738468,
 0.0835142433643341,
 0.024784518405795097,
 -0.03390802443027496,
 -0.008667881600558758,
 0.029855938628315926,
 -0.052759815007448196,
 0.07110268622636795,
 0.05204106867313385,
 0.011372745968401432,
 0.005306250415742397,
 0.014359422028064728,
 0.016763532534241676,
 -0.08928974717855453,
 0.03196073696017265,
 0.014056387357413769,
 -0.03454944118857384,
 -0.07900212705135345,
 -0.015968741849064827,
 -0.007973228581249714,
 0.01690717227756977,
 -0.020183715969324112,
 -0.03294486552476883,
 -0.0525507777929306,
 -0.21580363810062408,
 -0.008316099643707275,
 -0.07478665560483932,
 0.0012130499817430973,
 -0.03391803056001663,
 0.018824664875864983,
 -0.003531395923346281,
 -0.015312282368540764,
 0.02218020334839821,
 0.004796645138412714,
 0.02837197482585907,
 0.045133572071790695,
 0.021488485857844353,
 0.0073840441182255745,
 -0.03943

In [348]:
df_review_categories['review_category_gte_small_embeddings'][0]

[-0.014572163112461567,
 -0.014748369343578815,
 0.062114614993333817,
 -0.03490406274795532,
 0.028726663440465927,
 0.02606363035738468,
 0.0835142433643341,
 0.024784518405795097,
 -0.03390802443027496,
 -0.008667881600558758,
 0.029855938628315926,
 -0.052759815007448196,
 0.07110268622636795,
 0.05204106867313385,
 0.011372745968401432,
 0.005306250415742397,
 0.014359422028064728,
 0.016763532534241676,
 -0.08928974717855453,
 0.03196073696017265,
 0.014056387357413769,
 -0.03454944118857384,
 -0.07900212705135345,
 -0.015968741849064827,
 -0.007973228581249714,
 0.01690717227756977,
 -0.020183715969324112,
 -0.03294486552476883,
 -0.0525507777929306,
 -0.21580363810062408,
 -0.008316099643707275,
 -0.07478665560483932,
 0.0012130499817430973,
 -0.03391803056001663,
 0.018824664875864983,
 -0.003531395923346281,
 -0.015312282368540764,
 0.02218020334839821,
 0.004796645138412714,
 0.02837197482585907,
 0.045133572071790695,
 0.021488485857844353,
 0.0073840441182255745,
 -0.03943

In [349]:
#import numpy as np

# Double check review category embeddings mapping
assert np.allclose(
    embedding_model.encode(df_review_categories['review_category'][0]).tolist(), 
    df_review_categories['review_category_gte_small_embeddings'][0],
    rtol=1e-3, atol=1e-4 #increase to allow greater deviation
    #rtol=1e-7, atol=1e-8 #decrease for stricter precision
), "Embeddings do not match within tolerance!"

#### Backup embeddings to a .npz file

In [351]:
np.savez(
    EMBEDDING_BACKUP_FILE,
    review_text_embeddings=review_text_embeddings,
    review_categories_embeddings=review_categories_embeddings
)

#### Compute cosine similarity for review category

In [353]:
review_category = 'Lost Luggage'
review_category_tag = 'lost_luggage'

In [354]:
#from sklearn.metrics.pairwise import cosine_similarity

In [355]:
category_embedding = df_review_categories[df_review_categories['review_category'] \
    == review_category]['review_category_gte_small_embeddings'].values[0]
#category_embedding

In [356]:
#df_embed['review_gte_small_embeddings'][0]

In [357]:
# Compute cosine similarity for each row
df_embed[f'{review_category_tag}_cosim'] = df_embed['review_gte_small_embeddings'].apply(
    lambda row_embedding: cosine_similarity(
        np.array(category_embedding).reshape(1, -1),
        np.array(row_embedding).reshape(1, -1)
    )[0, 0]
)

#### Compute kneighbors distance for review category

In [359]:
#from sklearn.neighbors import NearestNeighbors

In [360]:
# Function to normalize a vector (to remove difference in magnitude of phrase vs paragraph
def normalize_vector(vector):
    norm = np.linalg.norm(vector)  # Compute the norm (magnitude)
    if norm == 0:  # Avoid division by zero
        return vector
    return vector / norm

In [361]:
df_embed['review_gte_small_embeddings']

0     [-0.006984285544604063, -0.012522410601377487,...
1     [-0.036641675978899, 0.022963620722293854, 0.0...
2     [0.0173887200653553, -0.020334232598543167, 0....
3     [-0.009622704237699509, 0.012122679501771927, ...
4     [0.025981714949011803, -0.01533507276326418, 0...
5     [0.009470708668231964, -0.0003374788211658597,...
6     [-0.01932797208428383, -0.029321730136871338, ...
7     [-0.02123030461370945, 0.03313056007027626, 0....
8     [-0.005881058983504772, -0.00143251265399158, ...
9     [-0.034532807767391205, 0.008047778159379959, ...
10    [-0.025037143379449844, 0.011341938748955727, ...
11    [-0.017263149842619896, -0.010811987332999706,...
12    [-0.0027542314492166042, 0.003345365868881345,...
13    [0.01381221879273653, 0.011859861202538013, 0....
14    [-0.014738035388290882, 0.01641179248690605, 0...
15    [0.018291329964995384, 0.038863785564899445, 0...
16    [0.020274097099900246, 0.0022986673284322023, ...
17    [-0.025463983416557312, -0.021118547767400

In [366]:
np.vstack(df_embed['review_gte_small_embeddings'].values)

array([[-0.00698429, -0.01252241,  0.05703719, ..., -0.03012245,
        -0.00372658,  0.00184007],
       [-0.03664168,  0.02296362,  0.06297143, ..., -0.03217833,
         0.01489523,  0.03898227],
       [ 0.01738872, -0.02033423,  0.07884842, ..., -0.0335802 ,
        -0.0046224 ,  0.04279783],
       ...,
       [-0.03161675, -0.00272595,  0.03586897, ..., -0.01866342,
         0.00134726,  0.04691767],
       [-0.01246334, -0.00630447,  0.03338405, ..., -0.13079868,
        -0.00736088,  0.04890582],
       [-0.03313306,  0.00826479,  0.04247745, ..., -0.03067465,
        -0.03462286,  0.0175295 ]])

In [367]:
normalize_vector(np.vstack(df_embed['review_gte_small_embeddings'].values))

array([[-0.00096855, -0.00173655,  0.00790963, ..., -0.00417723,
        -0.00051678,  0.00025517],
       [-0.00508129,  0.00318448,  0.00873257, ..., -0.00446233,
         0.0020656 ,  0.00540587],
       [ 0.00241138, -0.00281985,  0.01093431, ..., -0.00465674,
        -0.00064101,  0.00593499],
       ...,
       [-0.00438445, -0.00037802,  0.00497413, ..., -0.00258815,
         0.00018683,  0.00650631],
       [-0.00172835, -0.00087427,  0.00462953, ..., -0.01813851,
        -0.00102077,  0.00678202],
       [-0.00459473,  0.00114612,  0.00589056, ..., -0.00425381,
        -0.00480133,  0.0024309 ]])

In [368]:
# Need to be done only once
nn = NearestNeighbors(n_neighbors=len(df_embed), metric='minkowski')
nn.fit(np.vstack(df_embed['review_gte_small_embeddings'].values)) 
#nn.fit(normalize_vector(np.vstack(df_embed['review_gte_small_embeddings'].values))) 

In [369]:
distances, indices = nn.kneighbors(np.array(category_embedding).reshape(1, -1))
#distances, indices = nn.kneighbors(normalize_vector(np.array(category_embedding).reshape(1, -1)))
print(distances)
print(indices)

[[0.46878229 0.51480828 0.53093253 0.59881456 0.60083433 0.60472409
  0.60721252 0.60846331 0.6088974  0.61407781 0.61586843 0.61624029
  0.61632836 0.61680897 0.62042588 0.62263853 0.62308501 0.62314129
  0.62730452 0.62817659 0.62899864 0.6290269  0.63015868 0.63073087
  0.63073516 0.63176036 0.63350929 0.63815104 0.63885569 0.64311374
  0.64333274 0.64688024 0.64688625 0.64721661 0.64911286 0.64930776
  0.65035239 0.65172117 0.65212117 0.65299081 0.65348044 0.65593836
  0.65603552 0.6619919  0.66303471 0.66314923 0.66515735 0.66593044
  0.66755355 0.66863969 0.68917201 0.69571467]]
[[ 3 21  9 51 23 44 32  6  1 26 11 22 18 29 19 50 49 24 36 27 28  4 42 41
  46  2 15  8 17 35 31 14 38  5 33 13 37 39  7 10  0 45 20 25 40 12 30 34
  16 48 43 47]]


In [370]:
index_kndistance_mapping = dict(zip(indices[0], distances[0]))
#index_kndistance_mapping

In [371]:
# Assign kneighbors distances for each row
df_embed[f'{review_category_tag}_kndist'] = df_embed.index.map(index_kndistance_mapping)

#### Verify threshold for cosine similarity / kneighbors distance for review category

In [375]:
category_threshold = 0.84252

df_embed[df_embed[f'{review_category_tag}_cosim'] >= category_threshold]\
[['Id','Review',f'{review_category_tag}_cosim',f'{review_category_tag}_kndist']]\
    .sort_values(by=[f'{review_category_tag}_cosim'], ascending=True)
#0.843110 cutoff

Unnamed: 0,Id,Review,lost_luggage_cosim,lost_luggage_kndist
9,2993d295eabf0f36f735645e909cd369ced9725e9e47fa...,"Our flight from Coron to Manila, originally...",0.859055,0.530933
21,4c80a81b1120ea61c6dd0cf50682b5b222db872025c108...,Avoid this airline at all costs. I booked a...,0.867486,0.514808
3,455e41f45426cc1ebd206caba0945f054613394cb208c5...,No proper communication and very casual app...,0.890122,0.468782


In [376]:
# Random check a particular review
df_embed[df_embed.index == 12159]\
[['Id','Review',f'{review_category_tag}_cosim',f'{review_category_tag}_kndist']].to_string()
#false positives: 129, 18850, 7268

'Empty DataFrame\nColumns: [Id, Review, lost_luggage_cosim, lost_luggage_kndist]\nIndex: []'

In [377]:
df_embed[df_embed[f'{review_category_tag}_kndist'] <= 0.560]\
[['Id','Review',f'{review_category_tag}_cosim',f'{review_category_tag}_kndist']]\
    .sort_values(by=[f'{review_category_tag}_kndist'], ascending=False)
#0.56015 cutoff (secondary metric only after cosim)

Unnamed: 0,Id,Review,lost_luggage_cosim,lost_luggage_kndist
9,2993d295eabf0f36f735645e909cd369ced9725e9e47fa...,"Our flight from Coron to Manila, originally...",0.859055,0.530933
21,4c80a81b1120ea61c6dd0cf50682b5b222db872025c108...,Avoid this airline at all costs. I booked a...,0.867486,0.514808
3,455e41f45426cc1ebd206caba0945f054613394cb208c5...,No proper communication and very casual app...,0.890122,0.468782


#### Add review_category_flag based on threshold

In [379]:
df_embed[f'is_{review_category_tag}_flag'] = df_embed[f'{review_category_tag}_cosim'] >= category_threshold

In [380]:
print(len(df_embed))
df_embed.head()

52


Unnamed: 0,Id,Review,review_gte_small_embeddings,lost_luggage_cosim,lost_luggage_kndist,is_lost_luggage_flag
0,661172569262a3a1c15d87e9459a0fefc8664914266518...,Absolutely terrible customer service. Tried t...,"[-0.006984285544604063, -0.012522410601377487,...",0.786482,0.65348,False
1,925fcfda402034e92b439424f9eb35bec83154e497e641...,My son was travelling on Kids Solo as an un...,"[-0.036641675978899, 0.022963620722293854, 0.0...",0.814622,0.608897,False
2,c09a8175ab7eda10f0fddbbdc6227c44836d964901af2c...,Recently travelled on Air India flight from T...,"[0.0173887200653553, -0.020334232598543167, 0....",0.800439,0.63176,False
3,455e41f45426cc1ebd206caba0945f054613394cb208c5...,No proper communication and very casual app...,"[-0.009622704237699509, 0.012122679501771927, ...",0.890122,0.468782,True
4,a13ee7a23ddc22c11deadf130dbe3b23384427e2fd5e01...,"Worse experience, my flight suppose to depa...","[0.025981714949011803, -0.01533507276326418, 0...",0.802163,0.629027,False


#### Merge back review text labels to original dataframe

In [382]:
df_merged = df.merge(df_embed, how='left', on='Id', suffixes=['','_embed']).drop(columns=['Review_embed'])
len(df_merged)
df_merged.head()

Unnamed: 0,RowId,Airline Name,Overall_Rating,Review_Title,Review Date,Verified,Review,Top Review Image Url,Aircraft,Type Of Traveller,...,Wifi & Connectivity,Value For Money,Recommended,Id,sentiment_label,sentiment_scores,review_gte_small_embeddings,lost_luggage_cosim,lost_luggage_kndist,is_lost_luggage_flag
0,0,Aer Lingus,2.0,“terrible customer service”,2025-03-26,False,Absolutely terrible customer service. Tried t...,,,Couple Leisure,...,3.0,1,no,661172569262a3a1c15d87e9459a0fefc8664914266518...,negative,0.94876,"[-0.006984285544604063, -0.012522410601377487,...",0.786482,0.65348,False
1,1,Air France,10.0,"""excellent customer service""",2025-03-28,True,My son was travelling on Kids Solo as an un...,,,Family Leisure,...,,5,yes,925fcfda402034e92b439424f9eb35bec83154e497e641...,neutral,0.701676,"[-0.036641675978899, 0.022963620722293854, 0.0...",0.814622,0.608897,False
2,2,Air India,3.0,“it was not a pleasant experience”,2025-03-26,False,Recently travelled on Air India flight from T...,,,Family Leisure,...,1.0,1,no,c09a8175ab7eda10f0fddbbdc6227c44836d964901af2c...,negative,0.710657,"[0.0173887200653553, -0.020334232598543167, 0....",0.800439,0.63176,False
3,3,Air India Express,1.0,"""not accountable for any damage""",2025-03-23,True,No proper communication and very casual app...,https://www.airlinequality.com/wp-content/uplo...,,Family Leisure,...,1.0,1,no,455e41f45426cc1ebd206caba0945f054613394cb208c5...,negative,0.902924,"[-0.009622704237699509, 0.012122679501771927, ...",0.890122,0.468782,True
4,4,AirAsia X,2.0,"""flight is delay for 20 hours""",2025-03-24,True,"Worse experience, my flight suppose to depa...",,,Business,...,,1,no,a13ee7a23ddc22c11deadf130dbe3b23384427e2fd5e01...,negative,0.841998,"[0.025981714949011803, -0.01533507276326418, 0...",0.802163,0.629027,False


#### Persist to CSV file

In [384]:
df_merged.to_csv(f"data/silver_{ETL_DATE}_Airline_Reviews_LostLuggageLabel.csv", index=False, date_format='%Y-%m-%d')