In [1]:


import numpy as np
import pandas as pd
import re

def drop_date_rows(df: pd.DataFrame, text_column: str = 'cust_review_text') -> pd.DataFrame:
    """
    Drop rows whose text contains a "Date of experience: <Month> <Day>, <Year>" stamp.

    The match is case-insensitive and may occur anywhere in the text. Values that
    are not strings (e.g. NaN for a missing review) cannot contain the stamp, so
    they are kept instead of raising a TypeError.

    Parameters:
        df (pd.DataFrame): DataFrame containing the text column.
        text_column (str, optional): Name of the text column.
            Defaults to 'cust_review_text'.

    Returns:
        pd.DataFrame: A new DataFrame holding only the rows without the date
            stamp. Index labels and columns of the kept rows are unchanged.

    Raises:
        KeyError: If `text_column` is not a column of `df`.
    """
    date_pattern = re.compile(r'Date of experience:\s*\w+\s*\d{1,2},\s*\d{4}', re.IGNORECASE)

    # Build an explicitly boolean mask. Relying on Series.apply would yield an
    # object-dtype result for an empty frame, which is not a valid boolean indexer.
    has_date = pd.Series(
        [
            isinstance(text, str) and date_pattern.search(text) is not None
            for text in df[text_column]
        ],
        index=df.index,
        dtype=bool,
    )

    # Keep every row that does NOT carry the date stamp.
    return df.loc[~has_date]


# Do not wrap DataFrame reprs at a fixed character width.
pd.set_option('display.width', None)

# Location of the reviews CSV (absolute path on the author's machine).
REVIEWS_CSV = '/home/aboveclouds49/project/trustpilot_reviews.csv'
df = pd.read_csv(REVIEWS_CSV)

def sample_classes(df, target_column, n_samples, classes=None, random_state=42):
    """
    Draw up to `n_samples` random rows from each class of `target_column`.

    Parameters:
        df (pd.DataFrame): Source DataFrame.
        target_column (str): Column holding the class label.
        n_samples (int): Maximum number of rows to draw per class. A class with
            fewer rows contributes all of its rows.
        classes (iterable, optional): Class labels to sample, in output order.
            Defaults to the ratings 1 through 5.
        random_state (int, optional): Seed passed to `DataFrame.sample` for
            every class so the draw is reproducible. Defaults to 42.

    Returns:
        pd.DataFrame: The sampled rows, concatenated class by class in the
            order given by `classes`, keeping their original index labels.
            Empty (with the columns of `df`) when `classes` is empty.
    """
    if classes is None:
        classes = range(1, 6)

    sampled_parts = []
    for class_value in classes:
        class_rows = df[df[target_column] == class_value]
        # Cap the request at the class size; sample() raises if n exceeds it.
        n_take = min(n_samples, len(class_rows))
        sampled_parts.append(class_rows.sample(n=n_take, random_state=random_state))

    # pd.concat raises on an empty list, so return an empty slice instead.
    if not sampled_parts:
        return df.iloc[0:0]

    return pd.concat(sampled_parts)

# Sample up to 5000 reviews per rating, keep only the rating and text columns,
# discard rows with missing values or an embedded "Date of experience" stamp,
# and shorten the column names.
# Cleaning runs after sampling, so a class can end up with fewer than 5000 rows.
df = (
    sample_classes(df, 'cust_rating', n_samples=5000)[['cust_rating', 'cust_review_text']]
    .dropna()
    .pipe(drop_date_rows)
    .rename(columns={'cust_rating': 'rating', 'cust_review_text': 'text'})
)
df.head()


Unnamed: 0,rating,text
138345,1,Terrible! You sent a gate arm that was defecti...
8837,1,I would give negative stars if I could. I sent...
132569,1,left me with a broken generator on a service p...
81850,1,Received an email asking “how did we do?” Orde...
47741,1,I recently purchased our first French bulldog ...


In [2]:
from ner import TextAnalyzer, SentimentAnalysisResults
import os
import warnings

# Force CPU execution.
# NOTE(review): these variables are set after numpy (and `ner`) were imported.
# Thread-count variables are commonly read when the numeric library loads, so
# they may need to move above those imports to take effect. Confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = ''  # Disable GPU visibility
os.environ['OMP_NUM_THREADS'] = '1'  # Limit OpenMP threads
os.environ['MKL_NUM_THREADS'] = '1'  # Limit MKL threads

# Suppress the MPS "device does not support Metal 2.0" warning.
# `message` is a regular expression matched against the START of the warning
# text. It must not include the "UserWarning: " category prefix (the category
# goes in `category`), and literal dots must be escaped. The device name is
# wildcarded so the filter does not depend on one specific GPU model.
MPS_WARNING_PATTERN = r"Skipping device .* that does not support Metal 2\.0"
warnings.filterwarnings("ignore", message=MPS_WARNING_PATTERN, category=UserWarning)

In [3]:
from ner import TextAnalyzer, SentimentAnalysisResults

analyzer = TextAnalyzer()  # You can specify num_processes if needed
results = await analyzer.train_and_analyze(df, text_column='text', rating_column='rating', batch_size=100)

2025-01-16 13:40:28,479 - INFO - Starting sentiment analysis pipeline
Preprocessing texts: 100%|██████████| 100/100 [00:25<00:00,  3.92it/s]
Preprocessing texts: 100%|██████████| 100/100 [00:25<00:00,  3.89it/s]
Preprocessing texts: 100%|██████████| 100/100 [00:26<00:00,  3.77it/s]
Preprocessing texts: 100%|██████████| 100/100 [00:26<00:00,  3.78it/s]
Preprocessing texts: 100%|██████████| 100/100 [00:26<00:00,  3.83it/s]
Preprocessing texts: 100%|██████████| 100/100 [00:26<00:00,  3.76it/s]
Preprocessing texts: 100%|██████████| 100/100 [00:25<00:00,  3.96it/s]
Preprocessing texts: 100%|██████████| 100/100 [00:26<00:00,  3.73it/s]
Preprocessing texts: 100%|██████████| 100/100 [00:26<00:00,  3.72it/s]
Preprocessing texts: 100%|██████████| 100/100 [00:26<00:00,  3.71it/s]
Preprocessing texts: 100%|██████████| 100/100 [00:25<00:00,  3.88it/s]
Preprocessing texts: 100%|██████████| 100/100 [00:25<00:00,  3.89it/s]
Preprocessing texts: 100%|██████████| 100/100 [00:26<00:00,  3.75it/s]
Preproc

In [4]:
# Heading / attribute pairs, printed in this order. Each attribute is read
# right before it is printed.
report_sections = [
    ("Classification Report:\n", "classification_report"),
    ("\nConfusion Matrix:\n", "confusion_matrix"),
    ("\nFeature Importance (Top 10):\n", "feature_importance"),
    ("\nRating Distribution:\n", "rating_distribution"),
    ("\nText Length Statistics:\n", "text_length_stats"),
    ("\nSentiment by Length:\n", "sentiment_by_length"),
]
for heading, attribute in report_sections:
    section_value = getattr(results, attribute)
    if attribute == "feature_importance":
        # Only the first ten entries are shown.
        # NOTE(review): "top 10" presumes the mapping is already ordered by importance. Confirm.
        section_value = list(section_value.items())[:10]
    print(heading, section_value)

# Keep the per-sample predictions and probabilities at hand for later cells.
predictions = results.predictions
probabilities = results.probabilities

Classification Report:
 {'1': {'precision': 0.17611336032388664, 'recall': 0.18031088082901556, 'f1-score': 0.1781874039938556, 'support': 965.0}, '2': {'precision': 0.21226874391431352, 'recall': 0.22336065573770492, 'f1-score': 0.21767348976535197, 'support': 976.0}, '3': {'precision': 0.22111553784860558, 'recall': 0.2286302780638517, 'f1-score': 0.22481012658227847, 'support': 971.0}, '4': {'precision': 0.18157181571815717, 'recall': 0.14955357142857142, 'f1-score': 0.16401468788249693, 'support': 896.0}, '5': {'precision': 0.21083172147001933, 'recall': 0.22177009155645983, 'f1-score': 0.21616261774913237, 'support': 983.0}, 'accuracy': 0.2016280525986224, 'macro avg': {'precision': 0.20038023585499648, 'recall': 0.2007250955231207, 'f1-score': 0.20016966519462306, 'support': 4791.0}, 'weighted avg': {'precision': 0.20074364498276237, 'recall': 0.2016280525986224, 'f1-score': 0.20082150226811957, 'support': 4791.0}}

Confusion Matrix:
 [[174 201 208 177 205]
 [224 218 205 118 211]