import pandas as pd

In [None]:
# Load the raw reviews; later cells read the 'text_for_analysis' column of this frame.
data = pd.read_csv('./data/tourism_reviews.csv')

In [None]:
# Preview the first rows to sanity-check the load.
data.head()

In [None]:

import time
from pyabsa import AspectTermExtraction as ATEPC
def load_pyabsa_model():
    """Build the PyABSA multilingual aspect extractor.

    Progress is reported with ``print``. Loading is best-effort: any
    exception is printed rather than propagated.

    Returns:
        The ``ATEPC.AspectExtractor`` instance, or ``None`` if it could not
        be created.
    """
    model = None
    try:
        print("Loading PyABSA multilingual model...")
        model = ATEPC.AspectExtractor('multilingual', auto_device=True)
        print("PyABSA model loaded successfully.")
    except Exception as exc:
        print(f"Failed to load PyABSA model: {exc}")
        # Make sure a partially completed attempt never leaks out.
        model = None
    return model
    
# Load the model once at module level. load_pyabsa_model() returns None when
# loading fails, so `extractor` may be None from here on.
extractor = load_pyabsa_model()

def extract_aspects_from_batch(extractor, texts):
    """Run aspect/sentiment extraction on one batch and flatten the output.

    Args:
        extractor: Object exposing ``predict(texts, ...)``; in this file it is
            the value returned by ``load_pyabsa_model``.
        texts: List of review strings forming one batch.

    Returns:
        A list with one dict per extracted aspect, with keys ``text_id``,
        ``aspect``, ``evidence_span``, ``polarity``, ``confidence``, ``model``
        and ``latency_ms``. ``text_id`` is the position of the result within
        this call's output, so it restarts at 0 on every call. ``latency_ms``
        is the wall-clock time of the whole ``predict`` call and is repeated
        on every row. An empty list is returned when the model yields nothing
        or when any exception is raised (best-effort: the error is printed,
        not propagated).
    """
    try:
        start_time = time.time()
        # Extract aspects and sentiments from the batch
        results = extractor.predict(
            texts,
            print_result=False,
            save_result=False,
            ignore_error=True,
            pred_sentiment=True,
        )

        processing_time_ms = int((time.time() - start_time) * 1000)

        if not results:
            print(f"No results found for texts: {texts}")
            return []

        flattened_results = []
        # NOTE(review): text_idx assumes predict() returns one result per input
        # text, in order -- confirm this still holds with ignore_error=True.
        for text_idx, pyabsa_result in enumerate(results):
            for i, aspect in enumerate(pyabsa_result['aspect']):
                sentiments = pyabsa_result['sentiment']
                confidences = pyabsa_result['confidence']
                positions = pyabsa_result['position']

                # The parallel lists may be shorter than 'aspect'; fall back to
                # neutral defaults instead of failing.
                sentiment = sentiments[i] if i < len(sentiments) else 'Neutral'
                confidence = confidences[i] if i < len(confidences) else 0.0

                # Same bounds check for 'position': without it a short list
                # raises IndexError, which the handler below would turn into
                # the loss of every result in the batch.
                if positions and i < len(positions):
                    evidence_span = ' '.join(
                        pyabsa_result['tokens'][idx] for idx in positions[i]
                    )
                else:
                    evidence_span = aspect

                flattened_results.append({
                    'text_id': text_idx,
                    'aspect': aspect,
                    'evidence_span': evidence_span,
                    'polarity': sentiment,
                    'confidence': confidence,
                    'model': "pyabsa-multilingual",
                    'latency_ms': processing_time_ms,
                })

        return flattened_results

    except Exception as e:
        # Deliberate best-effort: report the failure and skip this batch.
        print(f"Error during sentiment extraction: {e}")
        return []

# Split the dataframe into batches of texts
batch_size = 100
texts_batch = [data.iloc[i:i+batch_size]['text_for_analysis'].tolist() for i in range(0, len(data), batch_size)]

# Process the batches
all_results = []
for batch_idx, batch in enumerate(texts_batch):
    result = extract_aspects_from_batch(extractor, batch)
    # extract_aspects_from_batch numbers texts from 0 within each call, so the
    # ids of different batches would collide. Shift them by the batch's start
    # position so that 'text_id' is the row position in `data`, which is what
    # the later merge on data.index expects.
    # NOTE(review): this relies on predict() returning one result per input
    # text, in order -- confirm this holds with ignore_error=True.
    offset = batch_idx * batch_size
    all_results.extend({**row, 'text_id': row['text_id'] + offset} for row in result)


In [None]:
# One row per extracted aspect; the columns come from the keys of the dicts in all_results.
aspect_df = pd.DataFrame(all_results)

In [None]:
# Save the aspect-level results on their own, without the DataFrame index.
aspect_df.to_csv("./data/tourism_reviews_aspect_sentiment.csv", index=False)

In [None]:
# Expose the row index of `data` as a 'text_id' column so it can serve as the
# merge key against aspect_df['text_id'].
data['text_id'] = data.index

In [None]:
# Cast both 'text_id' columns to str so the merge keys share one dtype.
# NOTE(review): if no aspects were extracted, aspect_df is built from an empty
# list and has no 'text_id' column, so the aspect_df line below raises KeyError.
data['text_id'] = data['text_id'].astype(str)  # Convert data's 'text_id' to string
aspect_df['text_id'] = aspect_df['text_id'].astype(str)  # Convert aspect_df's 'text_id' to string

# Left merge keeps every review: a review with several aspects is repeated once
# per aspect, and a review with none gets NaN in the aspect columns.
merged_df = pd.merge(data, aspect_df, on='text_id', how='left')

merged_df.head()


In [None]:
# Save the merged reviews-with-aspects table, without the DataFrame index.
merged_df.to_csv("./data/prepared_dataset.csv", index=False)