In [1]:
import pandas as pd
from tqdm import tqdm
import pandas as pd
# Load the source CSV into `df`; later cells analyze its 'text' column.
df = pd.read_csv('/Users/notagain/data_preprocessing/so_many_rev.csv')

# Special characters

In [2]:
import re
from collections import Counter
import functools
import logging
import pandas as pd

# Configure logging (optional)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Compile the regex pattern (do this ONCE)
NON_LETTER_REGEX = re.compile(r"(\w*)([^\w\s]+)(\w*)", re.IGNORECASE)


def _extract_non_letters_and_words(text):
    """Collect symbol runs and the tokens that contain them from one text.

    Every hit of NON_LETTER_REGEX is a run of characters that are neither
    word characters nor whitespace, plus any word characters directly
    attached to it on either side. Matches do not overlap, so in ``"a.b.c"``
    the hits are ``"a.b"`` and ``".c"``.

    Args:
        text: The string to scan.

    Returns:
        A tuple ``(counts, tokens)``: a Counter keyed by symbol run (for
        example ``'.'`` or ``'!!!'``) and a set of the lowercased tokens
        in which a symbol run was found.
    """
    symbol_tally = Counter()
    tokens = set()

    for hit in NON_LETTER_REGEX.finditer(text):
        before, symbols, after = hit.groups()
        symbol_tally[symbols] += 1

        token = f"{before}{symbols}{after}"
        if token:
            tokens.add(token.lower())

    return symbol_tally, tokens


def analyze_non_letters_vectorized(df, text_column='text'):
    """Aggregate symbol statistics over one DataFrame column via ``.apply``.

    Despite the name, ``Series.apply`` still calls the Python helper once per
    row; it is a convenience, not a true vectorized operation.

    Args:
        df: DataFrame holding the texts.
        text_column: Name of the column to scan; values are cast to ``str``.

    Returns:
        A tuple ``(counts, words)``: a Counter of symbol runs summed over all
        rows, and a list of the distinct lowercased tokens containing them.
    """
    per_row = df[text_column].astype(str).apply(_extract_non_letters_and_words)

    total_counts = Counter()
    flagged_words = set()

    # Each element of per_row is the (Counter, set) pair for one row.
    for row_counts, row_words in per_row:
        total_counts.update(row_counts)
        flagged_words.update(row_words)

    return total_counts, list(flagged_words)


def analyze_non_letters_batched(texts, batch_size=1000):
    """Count symbol runs and collect the tokens containing them, batch by batch.

    The texts are split into consecutive slices of ``batch_size`` items; each
    slice is handed to ``_process_batch`` as a tuple (tuples are hashable,
    which the cached helper requires) and the per-batch results are merged.

    Args:
        texts: A list of strings or a pandas Series (DataFrame column) to analyze.
            Non-string entries (for example NaN for missing values) are
            converted with ``str()``, matching the ``.astype(str)`` coercion
            used by the vectorized code path.
        batch_size: The number of texts to process in each batch. Must be
            at least 1.

    Returns:
        A tuple: (Counter of non-letter occurrences, list of words with non-letters).

    Raises:
        TypeError: If ``texts`` is neither a list nor a pandas Series.
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if isinstance(texts, pd.Series):
        texts = texts.tolist()  # Convert pandas Series to a list

    if not isinstance(texts, list):
        raise TypeError("Input 'texts' must be a list of strings or a pandas Series.")

    # A negative step would make range() empty and silently return no results.
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer.")

    all_non_letter_counts = Counter()
    all_words_with_non_letters = set()

    for start in range(0, len(texts), batch_size):
        # Coerce every entry to str so a stray NaN/number cannot crash the regex.
        batch = tuple(str(text) for text in texts[start:start + batch_size])

        try:
            non_letter_counts, words_with_non_letters = _process_batch(batch)
        except Exception as e:
            logging.error("Error processing batch %d: %s", start // batch_size, e)
            raise  # Stop execution; partial results would be misleading.

        all_non_letter_counts.update(non_letter_counts)
        all_words_with_non_letters.update(words_with_non_letters)

    return all_non_letter_counts, list(all_words_with_non_letters)


@functools.lru_cache(maxsize=128)  # Adjust maxsize based on your needs
def _process_batch(batch):
    """
    Tally symbol runs and collect the tokens containing them for one batch.

    Results are memoized by ``functools.lru_cache``, so ``batch`` must be
    hashable (pass a tuple, not a list). On a cache hit the very same Counter
    and list objects are returned again, so callers should treat them as
    read-only.

    Args:
        batch: A tuple of strings (text chunks).

    Returns:
        A tuple: (Counter of non-letter occurrences, list of words with non-letters).
    """
    symbol_tally = Counter()
    tokens = set()  # set => each token is recorded once per batch

    for document in batch:
        for hit in NON_LETTER_REGEX.finditer(document):
            # Group 2 is the run of non-word, non-space characters.
            symbol_tally[hit.group(2)] += 1

            # The whole match is prefix + symbols + suffix, i.e. the full token.
            token = hit.group(0)
            if token:
                tokens.add(token.lower())  # Lowercase for consistency

    return symbol_tally, list(tokens)




# Vectorized processing (if the DataFrame fits in memory)
# Runs the .apply-based analysis over df['text'] and prints the aggregated
# symbol counts plus the list of tokens that contain symbols.
non_letter_counts, words_with_non_letters = analyze_non_letters_vectorized(df, text_column='text')
print("Vectorized: Non-letter Counts:", non_letter_counts)
print("Vectorized: Words with Non-letters:", words_with_non_letters)

# Batched processing
# Same analysis through the batched code path. This rebinds non_letter_counts
# and words_with_non_letters, so later cells see the batched results.
non_letter_counts, words_with_non_letters = analyze_non_letters_batched(df['text'], batch_size=100)  # Small batch for example
print("Batched: Non-letter Counts:", non_letter_counts)
print("Batched: Words with Non-letters:", words_with_non_letters)

Vectorized: Non-letter Counts: Counter({'.': 218306, ',': 93037, "'": 31825, '’': 19837, '!': 14708, '-': 13434, '(': 7250, '"': 6615, '$': 6425, '/': 6068, ')': 4219, ':': 3601, '...': 2671, '?': 2606, '&': 2469, ').': 1890, '!!': 1650, '%': 1567, '“': 1559, '..': 1383, ';': 1355, '!!!': 1135, '”': 1072, '…': 846, '),': 739, '....': 673, '+': 549, '".': 522, '--': 476, '#': 445, '."': 384, '—': 290, '!!!!': 285, '*': 236, '.)': 230, '.....': 225, '”.': 219, '£': 196, '….': 186, '‘': 175, '??': 167, '",': 164, '.,': 149, '???': 141, '.”': 137, ':)': 132, '%.': 112, '!!!!!': 109, '!)': 108, ':(': 107, '($': 103, '@': 92, "'.": 81, '.-': 80, '......': 76, '…..': 71, '?!': 68, ',"': 68, '°': 67, '–': 66, '?)': 57, '????': 56, '**': 56, '”,': 55, '[': 53, '=': 53, '!).': 50, ']': 46, '!!!!!!': 44, '.(': 42, '~': 42, '👍': 42, '***': 41, '!"': 41, '?"': 41, '.......': 41, '$.': 37, '>': 36, "',": 35, '!!!!!!!': 34, '---': 34, '😊': 33, '-$': 32, ',,': 32, '´': 32, ':-': 31, '$$': 30, '%,': 29

In [3]:
non_letter_counts

Counter({'.': 218306,
         ',': 93037,
         "'": 31825,
         '’': 19837,
         '!': 14708,
         '-': 13434,
         '(': 7250,
         '"': 6615,
         '$': 6425,
         '/': 6068,
         ')': 4219,
         ':': 3601,
         '...': 2671,
         '?': 2606,
         '&': 2469,
         ').': 1890,
         '!!': 1650,
         '%': 1567,
         '“': 1559,
         '..': 1383,
         ';': 1355,
         '!!!': 1135,
         '”': 1072,
         '…': 846,
         '),': 739,
         '....': 673,
         '+': 549,
         '".': 522,
         '--': 476,
         '#': 445,
         '."': 384,
         '—': 290,
         '!!!!': 285,
         '*': 236,
         '.)': 230,
         '.....': 225,
         '”.': 219,
         '£': 196,
         '….': 186,
         '‘': 175,
         '??': 167,
         '",': 164,
         '.,': 149,
         '???': 141,
         '.”': 137,
         ':)': 132,
         '%.': 112,
         '!!!!!': 109,
         '!)': 108,
  

In [4]:
non_letter_counts, words_with_non_letters

(Counter({'.': 218306,
          ',': 93037,
          "'": 31825,
          '’': 19837,
          '!': 14708,
          '-': 13434,
          '(': 7250,
          '"': 6615,
          '$': 6425,
          '/': 6068,
          ')': 4219,
          ':': 3601,
          '...': 2671,
          '?': 2606,
          '&': 2469,
          ').': 1890,
          '!!': 1650,
          '%': 1567,
          '“': 1559,
          '..': 1383,
          ';': 1355,
          '!!!': 1135,
          '”': 1072,
          '…': 846,
          '),': 739,
          '....': 673,
          '+': 549,
          '".': 522,
          '--': 476,
          '#': 445,
          '."': 384,
          '—': 290,
          '!!!!': 285,
          '*': 236,
          '.)': 230,
          '.....': 225,
          '”.': 219,
          '£': 196,
          '….': 186,
          '‘': 175,
          '??': 167,
          '",': 164,
          '.,': 149,
          '???': 141,
          '.”': 137,
          ':)': 132,
          '%.': 112

In [5]:
import re
from collections import Counter
import functools
import logging
import pandas as pd

# Configure logging (optional)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Compile the regex pattern (do this ONCE)
NON_LETTER_REGEX = re.compile(r"(\w*)([^\w\s]+)(\w*)", re.IGNORECASE)

# Regex for cleaning: matches every single character that is neither a word
# character (\w: letters, digits, underscore) nor whitespace.
#
# Earlier revisions listed further alternatives (runs of periods, hyphens,
# repeated !/?/quotes, ...). Python tries alternatives left to right, and this
# single-character class already matches at every position where any of those
# could start, so they were never reached. They are dropped here; matching
# behaviour is unchanged.
CLEANING_REGEX = re.compile(
    r"""
    [^\w\s]  # Match any single character that's NOT a word character or whitespace
    """,
    re.VERBOSE | re.IGNORECASE
)


def clean_text(text):
    """Return ``text`` with every match of CLEANING_REGEX deleted.

    Matches are removed outright (replaced by the empty string), not replaced
    by a space, so words separated only by a removed character end up joined.
    """
    stripped = re.sub(CLEANING_REGEX, "", text)
    return stripped


def _extract_non_letters_and_words(text):
    """Find symbol runs in one text and the tokens they appear in.

    A hit of NON_LETTER_REGEX consists of optional leading word characters,
    one or more characters that are neither word characters nor whitespace,
    and optional trailing word characters.

    Args:
        text: The string to scan.

    Returns:
        A tuple ``(counts, tokens)``: a Counter keyed by symbol run and a set
        of the lowercased full matches.
    """
    counts = Counter()
    tokens = set()

    for hit in NON_LETTER_REGEX.finditer(text):
        lead, symbols, trail = hit.groups()
        counts[symbols] += 1

        token = "".join((lead, symbols, trail))
        if token:
            tokens.add(token.lower())

    return counts, tokens


def analyze_non_letters_vectorized(df, text_column='text'):
    """Clean a text column, then aggregate symbol statistics over the result.

    Side effect: a ``'cleaned_text'`` column is written onto ``df`` itself
    (overwriting any existing column of that name), and that same DataFrame
    object is returned.

    Args:
        df: DataFrame holding the texts; modified in place.
        text_column: Name of the column to clean; values are cast to ``str``.

    Returns:
        A tuple ``(counts, words, df)``: a Counter of symbol runs found in the
        cleaned text, a list of the distinct lowercased tokens containing
        them, and the input DataFrame with its ``'cleaned_text'`` column.
    """
    # Clean first so the analysis reports only what survived cleaning.
    raw_texts = df[text_column].astype(str)
    df['cleaned_text'] = raw_texts.apply(clean_text)

    total_counts = Counter()
    flagged_words = set()

    # Each element is the (Counter, set) pair produced for one cleaned row.
    for row_counts, row_words in df['cleaned_text'].apply(_extract_non_letters_and_words):
        total_counts.update(row_counts)
        flagged_words.update(row_words)

    return total_counts, list(flagged_words), df  # df returned for inspection

def analyze_non_letters_batched(texts, batch_size=1000):
    """Clean texts, then count remaining symbol runs and their tokens in batches.

    The texts are split into consecutive slices of ``batch_size`` items. Every
    item is passed through ``clean_text``; the cleaned slice is handed to
    ``_process_batch`` as a tuple (tuples are hashable, which the cached
    helper requires) and the per-batch results are merged.

    Args:
        texts: A list of strings or a pandas Series (DataFrame column) to analyze.
            Non-string entries (for example NaN for missing values) are
            converted with ``str()``, matching the ``.astype(str)`` coercion
            used by the vectorized code path.
        batch_size: The number of texts to process in each batch. Must be
            at least 1.

    Returns:
        A tuple: (Counter of non-letter occurrences, list of words with non-letters).

    Raises:
        TypeError: If ``texts`` is neither a list nor a pandas Series.
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if isinstance(texts, pd.Series):
        texts = texts.tolist()  # Convert pandas Series to a list

    if not isinstance(texts, list):
        raise TypeError("Input 'texts' must be a list of strings or a pandas Series.")

    # A negative step would make range() empty and silently return no results.
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer.")

    all_non_letter_counts = Counter()
    all_words_with_non_letters = set()

    for start in range(0, len(texts), batch_size):
        # Coerce to str before cleaning so a stray NaN/number cannot crash the regex.
        cleaned_batch = tuple(
            clean_text(str(text)) for text in texts[start:start + batch_size]
        )

        try:
            non_letter_counts, words_with_non_letters = _process_batch(cleaned_batch)
        except Exception as e:
            logging.error("Error processing batch %d: %s", start // batch_size, e)
            raise  # Stop execution; partial results would be misleading.

        all_non_letter_counts.update(non_letter_counts)
        all_words_with_non_letters.update(words_with_non_letters)

    return all_non_letter_counts, list(all_words_with_non_letters)

@functools.lru_cache(maxsize=128)  # Adjust maxsize based on your needs
def _process_batch(batch):
    """
    Count symbol runs and gather the tokens containing them for one batch.

    The function is wrapped in ``functools.lru_cache``, so ``batch`` has to be
    hashable (a tuple, not a list). Repeated calls with an equal batch return
    the identical cached Counter and list objects; treat them as read-only.

    Args:
        batch: A tuple of strings (text chunks).

    Returns:
        A tuple: (Counter of non-letter occurrences, list of words with non-letters).
    """
    tally = Counter()
    seen_tokens = set()  # set => uniqueness within the batch

    for chunk in batch:
        for found in NON_LETTER_REGEX.finditer(chunk):
            lead, symbols, trail = found.groups()

            # symbols is the run of non-word, non-space characters.
            tally[symbols] += 1

            # Re-join the three groups to recover the full token.
            token = lead + symbols + trail
            if token:
                seen_tokens.add(token.lower())  # Lowercase for consistency

    return tally, list(seen_tokens)



# Vectorized processing (if the DataFrame fits in memory)
# This version cleans df['text'] first, stores the result in a 'cleaned_text'
# column on df, and analyzes the cleaned text; df_result is the returned frame.
non_letter_counts, words_with_non_letters, df_result = analyze_non_letters_vectorized(df, text_column='text')
print("Vectorized: Non-letter Counts:", non_letter_counts)
print("Vectorized: Words with Non-letters:", words_with_non_letters)
print(df_result[['text', 'cleaned_text']])  # Display the original and cleaned text


# Batched processing
# The batched path cleans each text itself before analysis. This rebinds
# non_letter_counts and words_with_non_letters to the batched results.
non_letter_counts, words_with_non_letters = analyze_non_letters_batched(df['text'], batch_size=100)  # Small batch for example
print("Batched: Non-letter Counts:", non_letter_counts)
print("Batched: Words with Non-letters:", words_with_non_letters)

Vectorized: Non-letter Counts: Counter()
Vectorized: Words with Non-letters: []
                                                    text  \
0      It's extremely pricey so I specifically asked ...   
1      Wrong part ordered at first by the salesman.  ...   
2      My order was canceled without explanation.  Re...   
3      The review Cookflights.com  is connected with ...   
4      Put $400 a month away into a bank account don'...   
...                                                  ...   
60870  Darmawan our driver was great. He even partici...   
60871  The Notary was very patient and attentive.  Th...   
60872  Rush hour traffic inhibited initial service bu...   
60873  Green City Pros delivered exceptional AC repla...   
60874  The salesperson’s understanding of what I was ...   

                                            cleaned_text  
0      Its extremely pricey so I specifically asked i...  
1      Wrong part ordered at first by the salesman  S...  
2      My order was ca

In [6]:
non_letter_counts

Counter()

In [7]:
words_with_non_letters

[]

In [14]:
# Re-run the analysis on the already-cleaned column as a sanity check.
# With text_column='cleaned_text' the function cleans that column again and
# writes the result back to 'cleaned_text'.
non_letter_counts, words_with_non_letters, df_result = analyze_non_letters_vectorized(df_result, text_column='cleaned_text')
print("Vectorized: Non-letter Counts:", non_letter_counts)
print("Vectorized: Words with Non-letters:", words_with_non_letters)

# Batched processing
# Same check through the batched path, again on the cleaned column.
non_letter_counts, words_with_non_letters = analyze_non_letters_batched(df_result['cleaned_text'], batch_size=100)  # Small batch for example
print("Batched: Non-letter Counts:", non_letter_counts)
print("Batched: Words with Non-letters:", words_with_non_letters)

Vectorized: Non-letter Counts: Counter()
Vectorized: Words with Non-letters: []
Batched: Non-letter Counts: Counter()
Batched: Words with Non-letters: []


In [19]:
# removed all special characters

In [23]:
import pandas as pd
import random

def print_random_texts(df, text_column='text', num_samples=10):
    """
    Print a handful of randomly chosen values from one DataFrame column.

    If more samples are requested than the DataFrame has rows, a warning is
    printed and every value is shown in its original order instead.

    Args:
        df: The pandas DataFrame.
        text_column: The name of the column containing the text.  Defaults to 'text'.
        num_samples: The number of random samples to print. Defaults to 10.

    Raises:
        TypeError: If df is not a pandas DataFrame.
        ValueError: If text_column is not found in df, or if num_samples is invalid.
    """
    # --- argument validation (guard clauses) ---
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be a pandas DataFrame")
    if text_column not in df.columns:
        raise ValueError(f"Column '{text_column}' not found in DataFrame")
    if not isinstance(num_samples, int) or num_samples <= 0:
        raise ValueError("num_samples must be a positive integer")

    # --- choose what to show ---
    pool = df[text_column].tolist()  # random.sample needs a sequence, not a Series
    if num_samples > len(df):
        print(f"Warning: num_samples ({num_samples}) is greater than the DataFrame size ({len(df)}). Printing all texts.")
        chosen = pool
    else:
        chosen = random.sample(pool, num_samples)

    # --- output ---
    rule = "-" * 30
    print("\nRandom Text Samples:")
    print(rule)
    for position, text in enumerate(chosen, start=1):
        print(f"Sample {position}:\n{text}\n")  # trailing blank line separates samples
    print(rule)

# Example usage: print 10 random rows of df_result['cleaned_text'] for a quick
# visual check of the cleaning (requires df_result from the analysis cell).
print_random_texts(df_result, text_column='cleaned_text', num_samples=10)


Random Text Samples:
------------------------------
Sample 1:
easy website to use good pricing though when i applied a 5 coupon which the site accepted when it came to the final payment it deleted it

Sample 2:
got my order and everything was fine except I had ordered shafts u4 and got 3pw probably not a big deal just troublesome

Sample 3:
I thought the book layout was really great The cooperation and speed with which the project manager responded was very good through the process until it got close to the close out for the process My one criticism sis that when it came time to perform the final steps the process lagged and it kept me from getting my book to market to late for the Christmas season

Sample 4:
The nerve of this company calling  offering me 200 to change my review when theyre holding my fundsTerrible  I would not recommend this place to anyone

Sample 5:
The driver was the best part about the experience She kept me updated But the restaurant was an hour and 20 mins behi

In [25]:
import pandas as pd
from collections import Counter
from tqdm import tqdm

def count_words(df: pd.DataFrame, text_column: str) -> pd.DataFrame:
    """
    Count occurrences of each word across all texts in the DataFrame.

    Each value is converted to ``str``, lowercased, and split on whitespace;
    no further tokenization is applied. A tqdm progress bar is shown while
    the rows are processed.

    Args:
        df: DataFrame containing the texts
        text_column: Name of column containing text

    Returns:
        DataFrame with columns 'word' and 'count', sorted by count in
        descending order, with a fresh 0..n-1 index.
    """
    word_counts = Counter()

    # Iterate the column itself rather than df.iterrows(): iterrows builds a
    # Series for every row (slow) and may upcast values across columns before
    # they are stringified. Only this one column is needed.
    texts = df[text_column]
    for value in tqdm(texts, total=len(texts)):
        word_counts.update(str(value).lower().split())

    # Convert to DataFrame and sort
    word_count_df = pd.DataFrame({
        'word': list(word_counts.keys()),
        'count': list(word_counts.values())
    })

    return word_count_df.sort_values('count', ascending=False).reset_index(drop=True)

# Example usage:
# Count lowercased whitespace-separated words in the cleaned text and save
# the frequency table as CSV (without the index column).

word_counts = count_words(df_result, 'cleaned_text')
word_counts.to_csv('/Users/notagain/data_preprocessing/text_cleaning/original_word_counts.csv', index=False)

100%|██████████| 60875/60875 [00:04<00:00, 12908.77it/s]


In [27]:
# Remove the raw 'text' column in place, keeping only the cleaned text.
# NOTE(review): analyze_non_letters_vectorized returns the frame it was given,
# so df_result is presumably the same object as df and this drops 'text' from
# df as well -- confirm before reusing df['text'] in later cells.
df_result.drop(columns=['text'], inplace=True)

In [28]:
df_result

Unnamed: 0,rating,cleaned_text
0,1,Its extremely pricey so I specifically asked i...
1,1,Wrong part ordered at first by the salesman S...
2,1,My order was canceled without explanation Reg...
3,1,The review Cookflightscom is connected with t...
4,1,Put 400 a month away into a bank account dont ...
...,...,...
60870,5,Darmawan our driver was great He even particip...
60871,5,The Notary was very patient and attentive The...
60872,5,Rush hour traffic inhibited initial service bu...
60873,5,Green City Pros delivered exceptional AC repla...


In [29]:

import pandas as pd
import re
from collections import Counter
from tqdm import tqdm
from typing import Dict, Tuple

def replace_rare_words(df: pd.DataFrame, text_column: str = 'cleaned_text', 
                      min_frequency: int = 10) -> Tuple[pd.DataFrame, pd.DataFrame, Dict]:
    """
    Count word occurrences and replace rare words with 'unknown'.

    Words are whitespace-separated tokens and are counted case-sensitively.
    A word is kept when it occurs at least ``min_frequency`` times across the
    whole column; every other occurrence is replaced by the literal string
    'unknown'. Because each text is split and re-joined with single spaces,
    runs of whitespace in the output are collapsed. The input DataFrame is
    not modified; a copy is returned.

    Args:
        df: Input DataFrame
        text_column: Name of text column
        min_frequency: Minimum frequency threshold (default 10)

    Returns:
        tuple: (Modified DataFrame, Word Frequency DataFrame, Results dictionary)

        The results dictionary has the keys 'total_words_processed',
        'unique_words', 'words_above_threshold', 'words_replaced' (number of
        distinct words below the threshold), 'total_replacements' (number of
        word occurrences whose text actually changed) and
        'avg_replacements_per_row' (0.0 for an empty DataFrame).
    """
    # Work on a copy so the caller's DataFrame is left untouched
    df_modified = df.copy()

    print("Counting word frequencies...")
    # Cast once and reuse the result for both passes
    texts = df[text_column].astype(str)

    # First pass: count all words
    word_counter = Counter()
    for text in tqdm(texts):
        word_counter.update(text.split())

    # Set for O(1) membership tests in the second pass
    common_words = {word for word, count in word_counter.items()
                    if count >= min_frequency}

    # Create frequency DataFrame
    freq_df = pd.DataFrame({
        'word': list(word_counter.keys()),
        'frequency': list(word_counter.values())
    }).sort_values('frequency', ascending=False)

    print("\nReplacing rare words...")
    # Second pass: replace rare words
    modified_texts = []
    replacement_counts = []

    for text in tqdm(texts):
        words = text.split()
        modified_words = [
            word if word in common_words else 'unknown'
            for word in words
        ]

        # Count only tokens whose text changed; a rare word that is already
        # spelled 'unknown' is therefore not counted.
        replacement_counts.append(
            sum(1 for orig, mod in zip(words, modified_words) if orig != mod)
        )
        modified_texts.append(' '.join(modified_words))

    # Update DataFrame
    df_modified[text_column] = modified_texts

    total_replacements = sum(replacement_counts)
    row_count = len(df)

    # Compile results
    results = {
        'total_words_processed': sum(word_counter.values()),
        'unique_words': len(word_counter),
        'words_above_threshold': len(common_words),
        'words_replaced': len(word_counter) - len(common_words),
        'total_replacements': total_replacements,
        # Guard against ZeroDivisionError when the DataFrame has no rows
        'avg_replacements_per_row': total_replacements / row_count if row_count else 0.0
    }

    return df_modified, freq_df, results

def print_word_analysis(results: Dict, freq_df: pd.DataFrame):
    """Print the replacement summary followed by the head and tail of the frequency table.

    Args:
        results: Dictionary with the keys 'total_words_processed',
            'unique_words', 'words_above_threshold', 'words_replaced',
            'total_replacements' and 'avg_replacements_per_row'.
        freq_df: Word-frequency DataFrame; its first and last 20 rows are
            printed without the index.
    """
    rule = "-" * 60

    # (label, key) pairs for the summary lines that need no special formatting
    plain_stats = (
        ("Total words processed", 'total_words_processed'),
        ("Unique words found", 'unique_words'),
        ("Words above threshold", 'words_above_threshold'),
        ("Words replaced with 'unknown'", 'words_replaced'),
        ("Total replacements made", 'total_replacements'),
    )

    print("\nWord Frequency Analysis:")
    print(rule)
    for label, key in plain_stats:
        print(f"{label}: {results[key]}")
    # The average is the only figure rendered with two decimals
    print(f"Average replacements per row: {results['avg_replacements_per_row']:.2f}")

    # (heading, DataFrame method) pairs for the two table excerpts
    excerpts = (
        ("\nTop 20 most frequent words:", "head"),
        ("\nLeast frequent words (sample of 20):", "tail"),
    )
    for heading, selector in excerpts:
        print(heading)
        print(rule)
        print(getattr(freq_df, selector)(20).to_string(index=False))

# Replace every word seen fewer than 10 times in df_result['cleaned_text']
# (the function's default text column) with 'unknown'.
modified, freq_df, results = replace_rare_words(df_result, min_frequency=10)

# Print analysis
print_word_analysis(results, freq_df)

# Save the modified copy as CSV (without the index column).
modified.to_csv('/Users/notagain/data_preprocessing/text_cleaning/modified_text.csv', index=False)

Counting word frequencies...


100%|██████████| 60875/60875 [00:00<00:00, 65528.94it/s]



Replacing rare words...


100%|██████████| 60875/60875 [00:00<00:00, 65956.13it/s]



Word Frequency Analysis:
------------------------------------------------------------
Total words processed: 3592445
Unique words found: 81901
Words above threshold: 12054
Words replaced with 'unknown': 69847
Total replacements made: 134651
Average replacements per row: 2.21

Top 20 most frequent words:
------------------------------------------------------------
word  frequency
 the     146383
   I     113282
  to     112265
 and     105064
   a      77186
 was      63123
  of      47932
 for      43438
  my      41894
  it      39776
that      35716
  is      34896
  in      34196
with      32401
 not      31368
have      28242
  on      26992
they      24296
 but      22184
  me      21693

Least frequent words (sample of 20):
------------------------------------------------------------
          word  frequency
           sas          1
            hé          1
        Washer          1
          Mies          1
brokersbankers          1
       mortage          1
       Bougerv  