In [1]:


import numpy as np
import pandas as pd
import re

def drop_date_rows(df: pd.DataFrame, text_column: str = 'cust_review_text') -> pd.DataFrame:
    """
    Drop rows whose text contains "Date of experience:" followed by a date (Month Day, Year).

    Matching is case-insensitive. Cells that are not strings (e.g. NaN/None)
    can never match the pattern, so those rows are kept rather than raising.

    Parameters:
        df (pd.DataFrame): DataFrame containing the text column.
        text_column (str, optional): Name of the text column. Defaults to 'cust_review_text'.

    Returns:
        pd.DataFrame: The rows of ``df`` that do not contain the date stamp,
        with their original index labels and all columns preserved.

    Raises:
        KeyError: If ``text_column`` is not a column of ``df``.
    """
    date_pattern = re.compile(r'Date of experience:\s*\w+\s*\d{1,2},\s*\d{4}', re.IGNORECASE)

    def has_date_stamp(text) -> bool:
        # Guard against NaN/None/numeric cells: re.search would raise TypeError on them.
        return isinstance(text, str) and date_pattern.search(text) is not None

    # astype(bool) guarantees a boolean mask even for an empty column; an
    # object-dtype mask would be treated as a list of column labels by df[...].
    matches = df[text_column].map(has_date_stamp).astype(bool)

    # Invert the mask to keep only the rows without a date stamp.
    return df[~matches]



# Do not pin pandas' display width to a fixed number of characters.
# NOTE(review): per the pandas options docs, width auto-detection only works in a
# terminal, so this may have no visible effect inside a notebook — confirm.
pd.set_option('display.width', None)







# Load the raw reviews CSV (Trustpilot export, judging by the file name).
# NOTE(review): absolute, machine-specific path — this cell only runs on a host
# where that exact file exists; consider a configurable data directory.
df = pd.read_csv('/home/aboveclouds49/project/trustpilot_reviews.csv')
def sample_classes(df: pd.DataFrame, target_column: str, n_samples: int,
                   classes=range(1, 6), random_state=42) -> pd.DataFrame:
    """
    Draw up to ``n_samples`` rows per class and stack them into one DataFrame.

    Parameters:
        df (pd.DataFrame): Source data.
        target_column (str): Column holding the class label to stratify on.
        n_samples (int): Maximum number of rows drawn per class; a class with
            fewer rows contributes all of its rows.
        classes (iterable, optional): Class values to sample, in output order.
            Defaults to the ratings 1 through 5.
        random_state (optional): Seed passed to ``DataFrame.sample`` for every
            class so the draw is reproducible. Defaults to 42.

    Returns:
        pd.DataFrame: Sampled rows grouped class by class (in the order given by
        ``classes``), keeping their original index labels. Rows whose label is
        not in ``classes`` are never returned. If ``classes`` is empty, an
        empty frame with the columns of ``df`` is returned.

    Raises:
        KeyError: If ``target_column`` is not a column of ``df``.
    """
    sampled_rows = []

    for class_value in classes:
        class_rows = df[df[target_column] == class_value]
        # Cap at the class size: sample() without replacement cannot draw more rows than exist.
        n_take = min(n_samples, len(class_rows))
        sampled_rows.append(class_rows.sample(n=n_take, random_state=random_state))

    if not sampled_rows:
        # pd.concat raises on an empty list; return an empty frame with the same schema instead.
        return df.iloc[0:0]

    return pd.concat(sampled_rows)


# Class-balanced subset (at most 5000 reviews per rating), reduced to the two
# columns used downstream, with missing values and date-stamp rows removed,
# and the columns renamed to the short names 'rating' and 'text'.
df = (
    sample_classes(df, 'cust_rating', n_samples=5000)[['cust_rating', 'cust_review_text']]
    .dropna()
    .pipe(drop_date_rows)
    .rename(columns={'cust_rating': 'rating', 'cust_review_text': 'text'})
)
df.head()


Unnamed: 0,rating,text
138345,1,Terrible! You sent a gate arm that was defecti...
8837,1,I would give negative stars if I could. I sent...
132569,1,left me with a broken generator on a service p...
81850,1,Received an email asking “how did we do?” Orde...
47741,1,I recently purchased our first French bulldog ...


In [2]:
from contractions import TextProcessor, TextProcessorConfig  # Import necessary classes

# Build a TextProcessor with no arguments, i.e. without passing an explicit TextProcessorConfig.
text_processor = TextProcessor()  # presumably falls back to a default TextProcessorConfig — confirm in contractions.py

# Pre-process the 'text' column, then analyse nouns per rating on the processed frame.
# NOTE(review): analyze_nouns reads 'modified_text', so process_text_for_comparison
# presumably adds that column — confirm against the contractions module.
processed_df = text_processor.process_text_for_comparison(df, 'text')
text_processor.analyze_nouns(processed_df, 'modified_text', 'rating')



  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     /home/aboveclouds49/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/aboveclouds49/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/aboveclouds49/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/aboveclouds49/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/aboveclouds49/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!



Rating 1 - Top 10 nouns:
company: 1483
order: 1335
service: 1313
time: 1098
customer: 1049
money: 823
days: 655
day: 531
product: 528
business: 484

Rating 2 - Top 10 nouns:
order: 1422
time: 1217
service: 899
customer: 761
company: 703
days: 593
product: 546
day: 517
experience: 432
delivery: 410

Rating 3 - Top 10 nouns:
order: 1119
time: 1091
service: 712
product: 612
company: 494
customer: 476
delivery: 421
day: 412
days: 393
way: 367

Rating 4 - Top 10 nouns:
time: 780
service: 710
order: 699
product: 506
customer: 436
delivery: 410
experience: 406
company: 357
process: 339
quality: 325

Rating 5 - Top 10 nouns:
service: 1043
time: 797
customer: 617
experience: 582
order: 561
company: 497
process: 471
product: 455
quality: 437
everything: 383


In [3]:
import pandas as pd
from ngram_p2 import train_and_evaluate, NGramLabelAnalyzer


# N-gram/label association analysis over 1- to 5-grams.
# NOTE(review): in the recorded run the "Computing label associations" step took
# about 36 minutes — expect a long wait when re-running this cell.
analyzer = NGramLabelAnalyzer(ngram_range=(1, 5))
try:
    results = analyzer.analyze(df, 'text', 'rating')

    # Print each label's n-grams with their score to four decimals.
    # The loops unpack `results` as {label: iterable of (ngram, prob) pairs}.
    print("\nN-gram Analysis Results:")
    for label, ngrams in results.items():
        print(f"\nTop n-grams for label '{label}':")
        for ngram, prob in ngrams:
            print(f"  {ngram}: {prob:.4f}")

# NOTE(review): broad catch — any failure (including one raised while printing)
# is reduced to a one-line message and the traceback is lost.
except Exception as e:
    print(f"Error during n-gram analysis: {e}")

# Train and evaluate the model on the same columns and n-gram range.
# This runs even if the analysis above failed, because that error was swallowed.
try:
    model, label_encoder, vectorizer = train_and_evaluate(
        df,
        text_column='text',
        rating_column='rating',
        ngram_range=(1, 5)
    )
# NOTE(review): if training fails, `model`, `label_encoder` and `vectorizer` are
# never assigned, so later cells that use them will hit a NameError.
except Exception as e:
    print(f"Error during model training and evaluation: {e}")

2025-01-16 15:14:55,351 - ngram_p2 - INFO - Initialized NGramLabelAnalyzer with ngram_range=(1, 5)
2025-01-16 15:14:55,363 - ngram_p2 - INFO - Starting n-gram label analysis
2025-01-16 15:14:55,719 - ngram_p2 - INFO - Extracting n-grams with range (1, 5)
2025-01-16 15:15:02,968 - ngram_p2 - INFO - Successfully extracted 85925 n-grams
Computing label associations: 100%|██████████| 85925/85925 [35:54<00:00, 39.89it/s] 
2025-01-16 15:50:57,503 - ngram_p2 - INFO - Found 85925 n-grams for label 1
2025-01-16 15:50:58,017 - ngram_p2 - INFO - Found 85925 n-grams for label 2
2025-01-16 15:50:58,594 - ngram_p2 - INFO - Found 85925 n-grams for label 3
2025-01-16 15:50:59,191 - ngram_p2 - INFO - Found 85925 n-grams for label 4
2025-01-16 15:50:59,779 - ngram_p2 - INFO - Found 85925 n-grams for label 5
2025-01-16 15:50:59,787 - ngram_p2 - INFO - N-gram processing completed:
- Total n-grams: 85925
- Failed n-grams: 0 (0.00%)
- Final success rate: 100.00%
2025-01-16 15:51:00,025 - ngram_p2 - INFO - S


N-gram Analysis Results:

Top n-grams for label '1':
  00 called: 1.0000
  00 cash: 1.0000
  00 money: 1.0000
  000 000: 1.0000
  000 car: 1.0000
  000 dollars: 1.0000
  000 just: 1.0000
  000 loan: 1.0000
  000 offered: 1.0000
  000 super: 1.0000

Top n-grams for label '2':
  00 just: 1.0000
  00 time: 1.0000
  00 used: 1.0000
  000 gold: 1.0000
  10 00 credit: 1.0000
  10 15 minutes: 1.0000
  10 31: 1.0000
  10 additional: 1.0000
  10 long: 1.0000
  10 mins: 1.0000

Top n-grams for label '3':
  00 afternoon: 1.0000
  00 cost: 1.0000
  00 dollar: 1.0000
  00 poster: 1.0000
  10 days just: 1.0000
  10 mph: 1.0000
  10 return: 1.0000
  10 shipping: 1.0000
  11111111111: 1.0000
  11x17: 1.0000

Top n-grams for label '4':
  11 year old: 1.0000
  15 minutes time: 1.0000
  1x: 1.0000
  2024 didn: 1.0000
  25 product: 1.0000
  300 power: 1.0000
  3000 watt: 1.0000
  3000 watt inverter: 1.0000
  360 author solutions dream: 1.0000
  50 pounds: 1.0000

Top n-grams for label '5':
  10 10 buy: 1

2025-01-16 15:52:10,687 - ngram_p2 - INFO - Number of 1-grams: 3049
2025-01-16 15:52:10,693 - ngram_p2 - INFO - Number of 2-grams: 686
2025-01-16 15:52:10,694 - ngram_p2 - INFO - Number of 3-grams: 25
2025-01-16 15:52:10,696 - ngram_p2 - INFO - Number of 4-grams: 0
2025-01-16 15:52:10,697 - ngram_p2 - INFO - Number of 5-grams: 0
2025-01-16 15:52:10,699 - ngram_p2 - INFO - Encoded 5 unique labels
2025-01-16 15:52:33,313 - ngram_p2 - INFO - Cross-validation ROC-AUC scores: [0.81051234 0.80453773 0.80095889 0.80271486 0.79817281]
2025-01-16 15:52:33,315 - ngram_p2 - INFO - Mean ROC-AUC: 0.8034
2025-01-16 15:53:03,981 - ngram_p2 - INFO - 
Top 1-grams by Mutual Information:
2025-01-16 15:53:03,982 - ngram_p2 - INFO -   time: 0.2710
2025-01-16 15:53:03,982 - ngram_p2 - INFO -   service: 0.2549
2025-01-16 15:53:03,983 - ngram_p2 - INFO -   great: 0.2349
2025-01-16 15:53:03,984 - ngram_p2 - INFO -   order: 0.2328
2025-01-16 15:53:03,984 - ngram_p2 - INFO -   good: 0.2135
2025-01-16 15:53:03,98