In [1]:


import numpy as np
import pandas as pd
import re

def drop_date_rows(df: pd.DataFrame, text_column: str = 'cust_review_text') -> pd.DataFrame:
    """
    Drop rows whose text contains "Date of experience:" followed by a date.

    A row is dropped when its text matches, case-insensitively,
    "Date of experience: <Month> <Day>, <Year>" anywhere in the string.
    Cells that are not strings (NaN, None, numbers) can never match and are
    therefore kept instead of raising a TypeError.

    Parameters:
        df (pd.DataFrame): DataFrame containing the text column.
        text_column (str, optional): Name of the text column.
            Defaults to 'cust_review_text'.

    Returns:
        pd.DataFrame: The rows of ``df`` whose text does not contain the date
        pattern, with the original index labels preserved.

    Raises:
        KeyError: If ``text_column`` is not a column of ``df``.
    """
    date_pattern = re.compile(r'Date of experience:\s*\w+\s*\d{1,2},\s*\d{4}', re.IGNORECASE)

    def _has_date(value) -> bool:
        # Guard on type first: re.search would raise on float NaN / None.
        return isinstance(value, str) and date_pattern.search(value) is not None

    # astype(bool) guarantees a boolean mask even for an empty column, where
    # map() would otherwise hand back the column's original dtype.
    matches = df[text_column].map(_has_date).astype(bool)

    # Keep only the rows that did not match.
    return df[~matches]



# Let pandas auto-detect the display width instead of using a fixed value.
pd.options.display.width = None

# Load the raw review export.
# NOTE(review): absolute, machine-specific path; a fresh checkout on another
# machine will not find this file.
df = pd.read_csv(filepath_or_buffer='/home/aboveclouds49/project/trustpilot_reviews.csv')
def sample_classes(df, target_column, n_samples, class_values=None, random_state=42):
    """
    Draw up to ``n_samples`` rows from each class of ``target_column``.

    Parameters:
        df (pd.DataFrame): Source rows.
        target_column (str): Column holding the class label.
        n_samples (int): Maximum number of rows to draw per class. A class
            with fewer rows contributes all of its rows.
        class_values (iterable, optional): Class labels to sample, in output
            order. Defaults to the integers 1 through 5.
        random_state (int, optional): Seed passed to ``DataFrame.sample`` for
            every class. Defaults to 42.

    Returns:
        pd.DataFrame: The sampled rows of each class concatenated in
        ``class_values`` order, keeping their original index labels. Classes
        absent from ``df`` contribute no rows.
    """
    if class_values is None:
        class_values = range(1, 6)

    sampled_rows = []
    for class_value in class_values:
        # Rows belonging to the current class.
        class_rows = df[df[target_column] == class_value]

        # Cap the request at the class size so sample() never over-draws.
        n_take = min(n_samples, len(class_rows))
        sampled_rows.append(class_rows.sample(n=n_take, random_state=random_state))

    # pd.concat raises on an empty list, so return an empty frame that still
    # carries the columns of the input.
    if not sampled_rows:
        return df.iloc[0:0]

    return pd.concat(sampled_rows)


# Sample up to 5000 reviews per rating class and keep only the label and text
# columns. Cleaning happens after sampling, so a class can end up with fewer
# than 5000 rows once missing values and date-only rows are removed.
df = (
    sample_classes(df, 'cust_rating', n_samples=5000)[['cust_rating', 'cust_review_text']]
    # Non-inplace dropna: the frame above is derived from a column selection,
    # and mutating it in place can trigger SettingWithCopyWarning.
    .dropna()
    .pipe(drop_date_rows)
    # Rename by name rather than by position so the mapping is explicit.
    .rename(columns={'cust_rating': 'rating', 'cust_review_text': 'text'})
)
df.head()


Unnamed: 0,rating,text
138345,1,Terrible! You sent a gate arm that was defecti...
8837,1,I would give negative stars if I could. I sent...
132569,1,left me with a broken generator on a service p...
81850,1,Received an email asking “how did we do?” Orde...
47741,1,I recently purchased our first French bulldog ...


In [2]:
import pandas as pd
from tfidf_opt import (
    run_optimizations,
    NLTK_STOP_WORDS,
    SPACY_STOP_WORDS,
    CUSTOM_STOP_WORDS
)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# NOTE(review): (1, 5) covers 1-grams through 5-grams, not "up to trigrams".
# NGRAM_RANGE is not passed to run_optimizations in this cell — confirm whether
# tfidf_opt picks it up some other way or whether it is unused.
NGRAM_RANGE = (1, 5)  # n-gram sizes 1 through 5

# Column names of the prepared frame `df` (renamed to 'rating' / 'text' earlier).
text_col = 'text'
rating_col = 'rating'
n_trials = 50  # presumably the number of optimization trials per study — TODO confirm
num_processes = 4  # presumably the number of worker processes — TODO confirm
results_file = 'tfidf_opt_results.csv'  # Choose a different file name if desired

# Arguments are passed positionally; their meaning depends on the signature of
# tfidf_opt.run_optimizations, which is not visible in this notebook.
run_optimizations(df, text_col, rating_col, n_trials, num_processes, results_file)

Optimizing TF-IDF:   0%|          | 0/6 [00:00<?, ?it/s][I 2025-01-16 15:16:13,448] A new study created in memory with name: nltk_custom
[I 2025-01-16 15:16:13,498] A new study created in memory with name: nltk_baseline
  return bound(*args, **kwds)
[I 2025-01-16 15:16:13,589] A new study created in memory with name: spacy_custom
  return bound(*args, **kwds)
Processing Batches:   0%|          | 0/96 [00:00<?, ?it/s][I 2025-01-16 15:16:13,708] A new study created in memory with name: spacy_baseline
Processing Batches: 100%|██████████| 96/96 [00:07<00:00, 13.20it/s]
Processing Batches: 100%|██████████| 96/96 [00:07<00:00, 12.89it/s]
[I 2025-01-16 15:16:23,552] Trial 0 finished with value: 0.6716682704890671 and parameters: {'max_features': 535, 'min_df': 0.006554924695665751, 'max_df': 0.951910224142177, 'analyzer': 'char', 'svd_method': 'none', 'semantic_threshold': 7.4916018822422075, 'num_features_to_select': 1642, 'n_components': 75}. Best is trial 0 with value: 0.6716682704890671.
