# Fine-Tuning Data Preparation Process
1. [KeyBERT (all-MiniLM-L6-v2)](https://maartengr.github.io/KeyBERT/): 2-gram Keywords Extraction and Embedding

2. [Kmeans](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html): Topic Clustering with Embedding Vector

3. Reviews Selection: Select the 200 reviews closest to the cluster center from each of the 10 topics (2,000 reviews in total).

4. [Grok-2 (`grok-2-latest`, LLM)](https://x.ai/news/grok-2): Summary Soft-Labeling.

In [136]:
import os
from openai import OpenAI
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import json
load_dotenv('.env')

from keybert import KeyBERT

from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min

from tqdm.notebook import tqdm
from time import sleep


In [11]:
# Read the airline reviews table from disk.
data = pd.read_csv('../data/silver_20250324_Airlines_Reviews_Sentiment.csv')
# Keep only the review text, as a one-column DataFrame that later steps extend.
reviews_data = data['Review'].to_frame()
reviews_data

(24354, 23)

### 1. [KeyBERT (all-MiniLM-L6-v2)](https://maartengr.github.io/KeyBERT/): 2-gram Keywords Extraction and Embedding

In [None]:
# KeyBERT setup for 2-gram keyword extraction and embedding.
KEYBERT_BACKBONE = 'all-MiniLM-L6-v2'
kw_model = KeyBERT(model=KEYBERT_BACKBONE)
# Reuse the model object KeyBERT holds so keywords are embedded with the
# same backbone that extracted them (its `embed` method is called below).
embedding_model = kw_model.model

def extract_keywords_and_embeddings(text, top_n=8):
    """Extract 2-gram keyphrases from ``text`` and embed them.

    Args:
        text: The document to extract keyphrases from.
        top_n: Maximum number of keyphrases requested from KeyBERT.

    Returns:
        A ``(keywords, embeddings)`` tuple: the keyphrase strings in the order
        KeyBERT returned them, and the result of ``embedding_model.embed`` on
        that list (presumably one vector per keyphrase -- TODO confirm).
    """
    # Bigrams only, English stop words removed, MMR enabled.
    scored_phrases = kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(2, 2),
        stop_words='english',
        use_mmr=True,
        top_n=top_n,
    )
    # Drop the relevance scores; only the phrases themselves are embedded.
    keywords = [phrase for phrase, _score in scored_phrases]
    return keywords, embedding_model.embed(keywords)

def _keywords_and_embeddings_row(review):
    """Return the (keywords, embeddings) pair for one review as a 2-item Series."""
    # Reviews are cast to str so non-string cells (e.g. NaN) do not break extraction.
    return pd.Series(extract_keywords_and_embeddings(str(review)))


# Each returned Series expands into the two new columns.
reviews_data[['keywords_2gram', 'keywords_2gram_emb']] = (
    reviews_data['Review'].apply(_keywords_and_embeddings_row)
)

# Optional checkpoint of the extraction result (left disabled):
#reviews_data.to_csv('Airlines_Reviews_2gramEmbedding.csv')

### 2. [Kmeans](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html): Topic Clustering with Embedding Vector

In [None]:
# Use Average Embedding as Reviews Representative Vector
def average_embedding(emb_list):
    """Average a collection of embedding vectors into one vector.

    Args:
        emb_list: Array-like of embedding vectors, one per row.

    Returns:
        The element-wise mean over the first axis. When ``emb_list`` holds no
        embeddings, the scalar ``np.nan`` is returned instead.
    """
    stacked = np.asarray(emb_list)
    if stacked.size == 0:
        # An empty (0, dim) array would average to a *vector* of NaN, which
        # DataFrame.dropna does not treat as missing; a scalar NaN is dropped.
        return np.nan
    return np.mean(stacked, axis=0)

# Collapse each review's keyword embeddings into a single representative vector.
keyword_embeddings = reviews_data['keywords_2gram_emb']
reviews_data['review_vector'] = keyword_embeddings.map(average_embedding)
# Discard any row that has a missing value in any column.
reviews_data.dropna(axis=0, how='any', inplace=True)

# Stack the per-review vectors into one 2-D array for clustering.
review_vectors = np.vstack(reviews_data['review_vector'].to_numpy())




# Clustering: group the review vectors into topics with KMeans.
n_clusters = 10
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
cluster_labels = kmeans.fit(review_vectors).labels_
cluster_centers = kmeans.cluster_centers_

reviews_data['cluster'] = cluster_labels

# Euclidean distance from every review to the centre of its assigned cluster.
offsets_from_center = review_vectors - cluster_centers[cluster_labels]
reviews_data['distance_to_center'] = [
    np.linalg.norm(offset) for offset in offsets_from_center
]



### 3. Reviews Selection: Select the 200 reviews closest to the cluster center from each of the 10 topics (2,000 reviews in total).

In [125]:
# Number of reviews kept per cluster.
TOP = 200

# Sort ascending so `head` keeps the reviews nearest to each cluster centre.
ranked_by_distance = reviews_data.sort_values(by='distance_to_center')
top_per_cluster = (
    ranked_by_distance.groupby('cluster').head(TOP).reset_index(drop=True)
)
top_per_cluster[['cluster', 'Review', 'distance_to_center']].shape

# Export of the selection for the labeling step (left disabled):
# top_per_cluster.to_csv('Reviews_For_Labeling.csv', index=False)

(2000, 3)

### 4. [Grok-2 (`grok-2-latest`, LLM)](https://x.ai/news/grok-2): Summary Soft-Labeling.

In [132]:
# Reviews to be labeled. NOTE(review): this file is only produced by the
# commented-out `to_csv` in the selection step, so it must already exist on disk.
data_for_labeling = pd.read_csv('Reviews_For_Labeling.csv')

# LLM Initialization
XAI_API_KEY = os.getenv("XAI_API_KEY")
if not XAI_API_KEY:
    # Fail fast: with api_key=None the OpenAI client falls back to the
    # OPENAI_API_KEY environment variable, which would then be sent to the
    # xAI endpoint configured below.
    raise RuntimeError(
        "XAI_API_KEY is not set; define it in .env or the environment before labeling."
    )
client = OpenAI(
    api_key=XAI_API_KEY,
    base_url="https://api.x.ai/v1",
)


# Use batching for soft labeling
def _labeling_batch(review_list):
    """Summarize a batch of reviews with a single chat-completion request.

    Args:
        review_list: Review texts to summarize; each is numbered ("Review 1",
            "Review 2", ...) and appended to the system prompt.

    Returns:
        A list with one summary per input review, in input order. If the
        request fails, the reply is not valid JSON, or the reply does not
        contain exactly ``len(review_list)`` summaries, the error is printed
        and a list of empty strings of the same length is returned, so the
        caller's per-review alignment is never broken.
    """
    if not review_list:
        # Nothing to summarize; skip the API round trip.
        return []

    batch_prompt = "".join(
        f"\nReview {position}: {review}"
        for position, review in enumerate(review_list, start=1)
    )

    system_message = {
        "role": "system",
        "content": '''
You are a helpful assistant that summarizes user reviews related to aviation (airlines or airports). Your goal is to write a **concise and informative summary** of each post, keeping the key points and tone.

Rules:
1. Output a JSON list with the summaries for each review.
2. Match the order exactly.
3. Keep each summary short (1-2 sentences).
4. Return JSON in format: {"summaries": ["...", "...", ...]}

''' + batch_prompt
    }

    try:
        completion = client.chat.completions.create(
            model="grok-2-latest",
            temperature=0,
            response_format={"type": "json_object"},
            messages=[system_message]
        )

        response = json.loads(completion.choices[0].message.content)
        summaries = response["summaries"]

        # The model may merge, skip or invent entries; a wrong count would shift
        # every later summary onto the wrong review, so treat it as a failure.
        if not isinstance(summaries, list):
            raise ValueError("'summaries' is not a list")
        if len(summaries) != len(review_list):
            raise ValueError(
                f"expected {len(review_list)} summaries, got {len(summaries)}"
            )
        return summaries

    except Exception as e:
        # Best effort: report the problem and keep the batch aligned with blanks.
        print(f"Error in batch labeling: {e}")
        return [""] * len(review_list)


# Run the labeling: send the reviews to the LLM in fixed-size batches.
reviews = data_for_labeling['Review'].tolist()
batch_size = 20
summaries = []
batch_starts = range(0, len(reviews), batch_size)
for start in tqdm(batch_starts, ncols=100):
    chunk = reviews[start:start + batch_size]
    summaries += _labeling_batch(chunk)
    # Pause for one second between requests.
    sleep(1)
data_for_labeling['summary'] = summaries

# Persist the labeled reviews as the fine-tuning dataset.
data_for_labeling.to_csv('Reviews_for_training.csv', index=False)



Unnamed: 0.1,Unnamed: 0,Review,keywords_2gram,keywords_2gram_emb,review_vector,cluster,distance_to_center
0,7272,"Solid airline, will fly again. Professional, ...","['flight delayed', 'boarding literally', 'rema...",[[ 6.78944439e-02 -4.25142385e-02 -2.41788756e...,[ 2.31793001e-02 2.13883575e-02 -1.60253830e-...,5,0.244511
1,20018,"Worst experience ever, the long delays, to r...","['flight cancelled', 'melbourne airport', 'cal...",[[ 0.04839371 -0.00492003 0.00085667 ... -0.0...,[ 1.53331067e-02 9.13699530e-03 -9.21477564e-...,6,0.245582
2,15726,I flew with NIKI this week from Vienna to Al...,"['flew niki', 'vienna alicante', 'expensive re...",[[-0.01471053 0.01716902 -0.02504953 ... -0.0...,[-2.02544983e-02 2.36782003e-02 -1.38547458e-...,3,0.246489
3,16527,Osaka to Hong Kong. I have flown Peach severa...,"['osaka hong', 'flown peach', 'cost airline', ...",[[ 0.02480116 0.00897194 0.02534627 ... -0.0...,[ 1.52944177e-02 1.75527278e-02 -8.45169462e-...,8,0.247793
4,11589,I had to change my flight route to Venice a ...,"['helpful flight', 'chisinau venice', 'drink s...",[[ 0.0440908 0.0415143 -0.00632484 ... 0.0...,[ 9.40260012e-04 9.34662111e-03 -1.31993005e-...,7,0.249408
...,...,...,...,...,...,...,...
1995,7684,Johannesburg to Durban return on Comair (Bri...,"['durban return', 'late flight', 'jnb bag', 'a...",[[-0.06542107 0.04507969 -0.0499328 ... -0.0...,[ 1.57586411e-02 3.99073912e-03 -3.65359411e-...,0,0.300459
1996,10887,CDG-TBS and TBS-AMS. Brezhnev era service and ...,"['cdg tbs', 'georgia recommend', 'airline trav...",[[-0.04120119 -0.00643476 0.03505153 ... -0.1...,[-1.72636751e-02 2.09203716e-02 -1.16959680e-...,0,0.300505
1997,10886,CDG-TBS and TBS-AMS. Brezhnev era service and ...,"['cdg tbs', 'georgia recommend', 'airline trav...",[[-0.04120119 -0.00643476 0.03505153 ... -0.1...,[-1.72636751e-02 2.09203716e-02 -1.16959680e-...,0,0.300505
1998,20932,Leon Bajio to Puerto Vallarta. Punctual airl...,"['bajio puerto', 'punctual airline', 'flight c...",[[ 0.0021889 0.03311267 -0.04355304 ... -0.0...,[ 1.26226731e-02 4.72505391e-02 -1.89940874e-...,0,0.300581
