In [4]:
import pandas as pd
from tqdm import tqdm
import pandas as pd
df = pd.read_csv('/Users/notagain/data_preprocessing/so_many_rev.csv')
#original = pd.read_csv('/Users/notagain/data_preprocessing/text_cleaning/original.csv')
#from second_try import ContractionExpander
#expander = ContractionExpander()
#df_expanded = expander.expand_dataframe(df, 'text')

import pandas as pd
from collections import Counter
from tqdm import tqdm

def count_words(df: pd.DataFrame, text_column: str) -> pd.DataFrame:
    """
    Count occurrences of each word across all texts in the DataFrame.
    
    Args:
        df: DataFrame containing the texts
        text_column: Name of column containing text
        
    Returns:
        DataFrame with word counts sorted by frequency
    """
    word_counts = Counter()
    
    # Process each text and count words
    for _, row in tqdm(df.iterrows(), total=len(df)):
        text = str(row[text_column]).lower()  # Convert to string and lowercase
        words = text.split()
        word_counts.update(words)
    
    # Convert to DataFrame and sort
    word_count_df = pd.DataFrame({
        'word': list(word_counts.keys()),
        'count': list(word_counts.values())
    })
    
    return word_count_df.sort_values('count', ascending=False).reset_index(drop=True)

# Example usage:

#word_counts = count_words(df, 'text')
#word_counts.to_csv('/Users/notagain/data_preprocessing/text_cleaning/original_word_counts.csv', index=False)

In [10]:
import re
from collections import Counter

def analyze_all_contractions(df, text_column='text'):
    """
    Analyze all contractions before and after expansion in a DataFrame.
    
    Args:
        df: Input DataFrame
        text_column: Name of the text column to analyze
    
    Returns:
        tuple: (Results dictionary, expanded DataFrame)
    """
    # Make a copy of original DataFrame
    df_expanded = df.copy()
    
    # Define contractions with their patterns and expansions
    # Added the standard ASCII apostrophe (U+0027) explicitly
    contractions = {
        "its": (r"\b(?<!th)its\b(?!\s+own\b)", "it is"),  # added pattern for 'its', excluding 'this' and 'its own'
        "it's": (r"it[''\u2019]s", "it is"),
        "that's": (r"that[''\u2019]s", "that is"),
        "there's": (r"there[''\u2019]s", "there is"),
        "he's": (r"he[''\u2019]s", "he is"),
        "she's": (r"she[''\u2019]s", "she is"),
        "what's": (r"what[''\u2019]s", "what is"),
        "i'm": (r"i[''\u2019]m", "i am"),
        "i've": (r"i[''\u2019]ve", "i have"),
        "i'll": (r"i[''\u2019]ll", "i will"),
        "i'd": (r"i[''\u2019]d", "i would"),
        "don't": (r"don[''\u2019]t", "do not"),
        "doesn't": (r"doesn[''\u2019]t", "does not"),
        "didn't": (r"didn[''\u2019]t", "did not"),
        "won't": (r"won[''\u2019]t", "will not"),
        "can't": (r"can[''\u2019]t", "cannot"),
        "couldn't": (r"couldn[''\u2019]t", "could not"),
        "shouldn't": (r"shouldn[''\u2019]t", "should not"),
        "wouldn't": (r"wouldn[''\u2019]t", "would not"),
        "they're": (r"they[''\u2019]re", "they are"),
        "they've": (r"they[''\u2019]ve", "they have"),
        "they'll": (r"they[''\u2019]ll", "they will"),
        "we're": (r"we[''\u2019]re", "we are"),
        "we've": (r"we[''\u2019]ve", "we have"),
        "we'll": (r"we[''\u2019]ll", "we will"),
        "you're": (r"you[''\u2019]re", "you are"),
        "you've": (r"you[''\u2019]ve", "you have"),
        "you'll": (r"you[''\u2019]ll", "you will"),
        "let's": (r"let[''\u2019]s", "let us")
    }
    
    # Rest of the function remains the same
    results = {
        'contractions': {},
        'word_counts': {
            'original': 0,
            'expanded': 0
        }
    }
    
    # Count original contractions and words
    for text in df[text_column]:
        text = str(text)
        results['word_counts']['original'] += len(text.split())
        
        # Count each contraction type
        for contraction, (pattern, _) in contractions.items():
            count = len(re.findall(pattern, text, re.IGNORECASE))
            if contraction not in results['contractions']:
                results['contractions'][contraction] = {'original': 0, 'expanded': 0}
            results['contractions'][contraction]['original'] += count
    
    # Expand contractions
    def expand_text(text):
        text = str(text)
        for _, (pattern, expansion) in contractions.items():
            text = re.sub(pattern, expansion, text, flags=re.IGNORECASE)
        return text
    
    # Apply expansion
    df_expanded[text_column] = df_expanded[text_column].apply(expand_text)
    
    # Count remaining contractions and expanded words
    for text in df_expanded[text_column]:
        text = str(text)
        results['word_counts']['expanded'] += len(text.split())
        
        # Count remaining contractions
        for contraction, (pattern, _) in contractions.items():
            count = len(re.findall(pattern, text, re.IGNORECASE))
            results['contractions'][contraction]['expanded'] += count
    
    # Calculate success rate for each contraction
    for contraction in results['contractions']:
        original = results['contractions'][contraction]['original']
        expanded = results['contractions'][contraction]['expanded']
        results['contractions'][contraction]['success_rate'] = \
            100 * (original - expanded) / original if original > 0 else 100
    
    return results, df_expanded

def print_analysis(results):
    """Print the analysis results in a readable format"""
    print("\nContraction Analysis:")
    print("-" * 60)
    print(f"{'Contraction':<15} {'Original':<10} {'Remaining':<10} {'Success Rate':<10}")
    print("-" * 60)
    
    for contraction, counts in results['contractions'].items():
        if counts['original'] > 0:  # Only show contractions that were present
            print(f"{contraction:<15} {counts['original']:<10} {counts['expanded']:<10} {counts['success_rate']:.1f}%")
    
    print("\nWord Counts:")
    print("-" * 60)
    print(f"Original text: {results['word_counts']['original']}")
    print(f"Expanded text: {results['word_counts']['expanded']}")
    print(f"Difference: {results['word_counts']['expanded'] - results['word_counts']['original']}")


results, expanded_df = analyze_all_contractions(df)
print_analysis(results)


Contraction Analysis:
------------------------------------------------------------
Contraction     Original   Remaining  Success Rate
------------------------------------------------------------
its             1403       0          100.0%
it's            5639       0          100.0%
that's          1270       0          100.0%
there's         515        0          100.0%
he's            275        0          100.0%
she's           103        0          100.0%
what's          227        0          100.0%
i'm             5366       0          100.0%
i've            3453       0          100.0%
i'll            916        0          100.0%
i'd             859        0          100.0%
don't           5573       0          100.0%
doesn't         1548       0          100.0%
didn't          5578       0          100.0%
won't           1184       0          100.0%
can't           2328       0          100.0%
couldn't        1405       0          100.0%
shouldn't       210        0          1

In [7]:
exp_count = count_words(expanded_df, 'text')
exp_count.to_csv('/Users/notagain/data_preprocessing/text_cleaning/exp_count.csv', index=False)

100%|██████████| 60875/60875 [00:05<00:00, 11757.52it/s]


In [12]:
import re
import pandas as pd
from datetime import datetime

def analyze_number_replacement(df, text_column='text'):
    """
    Replace numbers with 'number' in text and analyze word counts before and after.
    
    Args:
        df (pandas.DataFrame): Input DataFrame
        text_column (str): Name of text column to analyze
    
    Returns:
        tuple: (Modified DataFrame, Word count analysis DataFrame, Analysis results dict)
    """
    # Copy the input DataFrame
    df_modified = df.copy()
    
    # Initialize word count storage
    original_word_counts = []
    modified_word_counts = []
    total_numbers_replaced = 0
    
    # Regular expression for numbers (positive/negative, integers/decimals)
    number_pattern = r'-?\d*\.?\d+'
    
    # Process each text entry
    for text in df[text_column]:
        # Convert to string and get original word count
        original_text = str(text)
        original_word_counts.append(len(original_text.split()))
        
        # Count numbers in original text
        numbers_found = len(re.findall(number_pattern, original_text))
        total_numbers_replaced += numbers_found
        
        # Replace numbers and get new word count
        modified_text = re.sub(number_pattern, 'number', original_text)
        modified_word_counts.append(len(modified_text.split()))
    
    # Create modified DataFrame
    df_modified[text_column] = df[text_column].apply(
        lambda x: re.sub(number_pattern, 'number', str(x))
    )
    
    # Create analysis DataFrame
    analysis_df = pd.DataFrame({
        'original_text': df[text_column],
        'modified_text': df_modified[text_column],
        'original_word_count': original_word_counts,
        'modified_word_count': modified_word_counts,
        'word_count_difference': [m - o for m, o in zip(modified_word_counts, original_word_counts)]
    })
    
    # Create results dictionary for printing
    results = {
        'word_counts': {
            'original': sum(original_word_counts),
            'modified': sum(modified_word_counts)
        },
        'number_stats': {
            'total_numbers': total_numbers_replaced,
            'total_rows': len(df),
            'avg_numbers_per_row': total_numbers_replaced / len(df) if len(df) > 0 else 0
        }
    }
    
    # Save analysis to CSV
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f'number_word_analysis_{timestamp}.csv'
    analysis_df.to_csv(filename, index=False)
    print(f"Analysis saved to: {filename}")
    
    return df_modified, analysis_df, results

def print_analysis(results):
    """Print the number replacement analysis results in a readable format"""
    print("\nNumber Replacement Analysis:")
    print("-" * 60)
    print(f"{'Metric':<25} {'Value':<15}")
    print("-" * 60)
    
    # Number statistics
    print(f"{'Total numbers replaced':<25} {results['number_stats']['total_numbers']:<15}")
    print(f"{'Total rows processed':<25} {results['number_stats']['total_rows']:<15}")
    print(f"{'Avg numbers per row':<25} {results['number_stats']['avg_numbers_per_row']:.2f}")
    
    print("\nWord Counts:")
    print("-" * 60)
    print(f"Original text total: {results['word_counts']['original']}")
    print(f"Modified text total: {results['word_counts']['modified']}")
    print(f"Difference: {results['word_counts']['modified'] - results['word_counts']['original']}")


modified_df_1, analysis, results = analyze_number_replacement(expanded_df)

num_count = count_words(modified_df_1, 'text')
num_count.to_csv('/Users/notagain/data_preprocessing/text_cleaning/number_count.csv', index=False)


Analysis saved to: number_word_analysis_20250205_085313.csv


100%|██████████| 60875/60875 [00:05<00:00, 11684.83it/s]


In [13]:
import re
import pandas as pd
import numpy as np
from datetime import datetime
from functools import lru_cache
from concurrent.futures import ThreadPoolExecutor
from typing import List, Dict, Tuple
import multiprocessing
from tqdm import tqdm
import joblib
import os

class TextPatternReplacer:
    """A class to efficiently handle text pattern replacement and analysis"""
    
    def __init__(self, cache_dir: str = '.cache'):
        """Initialize the replacer with compiled patterns and cache"""
        # Set up caching directory
        self.cache_dir = cache_dir
        os.makedirs(cache_dir, exist_ok=True)
        
        # Compile patterns once during initialization
        self.patterns = self._compile_patterns()
        
        # Initialize result cache
        self.result_cache = {}
        
        # Determine optimal chunk size based on CPU count
        self.cpu_count = multiprocessing.cpu_count()
        
    def _compile_patterns(self) -> Dict[str, re.Pattern]:
        """Compile regex patterns for better performance"""
        pattern_strings = {
            'numbers': r'-?\d*\.?\d+',
            'ordinals': r'\d+(?:st|nd|rd|th)',
            'written_numbers': r'zero|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|twenty|thirty|forty|fifty|sixty|seventy|eighty|ninety|hundred|thousand|million|billion|trillion',
            'fractions': r'dozen|couple|pair|half|quarter|third|fourth|fifth|sixth|seventh|eighth|ninth|tenth',
            'times': r'\d{1,2}:\d{2}(?::\d{2})?(?:\s*[AaPp][Mm])?',
            'am_pm': r'\d{1,2}\s*[AaPp][Mm]',
            'time_words': r'noon|midnight|morning|afternoon|evening|night',
            'time_modifiers': r'(?:early|late|mid|around|about|before|after)\s+(?:morning|afternoon|evening|night)',
            'time_units': r'(?:milli)?seconds?|minutes?|hours?|days?|weeks?|months?|years?|decades?|centuries?',
            'time_periods': r'fortnight|semester|quarter|annual|biannual|biennial',
            'frequencies': r'daily|weekly|monthly|yearly|hourly',
            'dates': r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}|\d{4}[/-]\d{1,2}[/-]\d{1,2}|\d{4}-\d{2}-\d{2}',
            'months': r'(?:January|February|March|April|May|June|July|August|September|October|November|December|Jan|Feb|Mar|Apr|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)',
            'days': r'(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday|Mon|Tue|Tues|Wed|Thu|Thur|Thurs|Fri|Sat|Sun)',
            'seasons': r'(?:Spring|Summer|Fall|Autumn|Winter)',
            'relative_time': r'(?:last|next|this)\s+(?:week|month|year|decade|century)',
            'relative_days': r'(?:yesterday|today|tomorrow)',
            'time_distance': r'(?:ago|from now)',
            'decades': r'(?:19|20)\d0s|(?:nineteen|twenty)-(?:(?:twen|thir|for|fif|six|seven|eigh|nine)ties)',
            'measurements': r'\d+\s*(?:km|m|cm|mm|mi|ft|in|kg|g|mg|lb|oz)',
            'measurement_words': r'(?:kilo|centi|milli)?(?:meters?|grams?|litres?|liters?)',
            'currency': r'\$\d+(?:\.\d{2})?|\d+\s*(?:dollars?|cents?|euros?|pounds?)',
            'percentages': r'\d+(?:\.\d+)?%|\d+\s+percent'
        }
        
        return {name: re.compile(pattern, re.IGNORECASE) for name, pattern in pattern_strings.items()}
    
    @lru_cache(maxsize=10000)
    def _process_text(self, text: str) -> Tuple[str, int]:
        """Process a single text string with caching"""
        text = str(text)
        replacements = 0
        
        # Convert text to hashable form for caching
        cache_key = hash(text)
        if cache_key in self.result_cache:
            return self.result_cache[cache_key]
        
        modified_text = text
        for pattern in self.patterns.values():
            matches = pattern.findall(modified_text)
            replacements += len(matches)
            modified_text = pattern.sub('number', modified_text)
        
        result = (modified_text, replacements)
        self.result_cache[cache_key] = result
        return result
    
    def _process_batch(self, texts: List[str]) -> List[Tuple[str, int, int]]:
        """Process a batch of texts and return original word counts"""
        return [(self._process_text(text)[0],  # modified text
                self._process_text(text)[1],  # replacement count
                len(str(text).split()))  # original word count
               for text in texts]
    
    def analyze_texts(self, df: pd.DataFrame, text_column: str = 'text', 
                     batch_size: int = None) -> Tuple[pd.DataFrame, pd.DataFrame, Dict]:
        """
        Analyze texts with batch processing and parallel execution
        """
        # Determine optimal batch size if not provided
        if batch_size is None:
            batch_size = max(100, len(df) // (self.cpu_count * 4))
        
        # Initialize storage
        modified_texts = []
        replacement_counts = []
        original_word_counts = []
        modified_word_counts = []
        
        # Process in batches with progress bar
        texts = df[text_column].tolist()
        batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]
        
        with ThreadPoolExecutor(max_workers=self.cpu_count) as executor:
            results = list(tqdm(
                executor.map(self._process_batch, batches),
                total=len(batches),
                desc="Processing text batches"
            ))
        
        # Flatten results
        for batch_result in results:
            for modified_text, replacements, orig_word_count in batch_result:
                modified_texts.append(modified_text)
                replacement_counts.append(replacements)
                original_word_counts.append(orig_word_count)
                modified_word_counts.append(len(modified_text.split()))
        
        # Create modified DataFrame
        df_modified = df.copy()
        df_modified[text_column] = modified_texts
        
        # Create analysis DataFrame
        analysis_df = pd.DataFrame({
            'original_text': df[text_column],
            'modified_text': modified_texts,
            'replacements': replacement_counts,
            'original_word_count': original_word_counts,
            'modified_word_count': modified_word_counts,
            'word_count_difference': np.subtract(modified_word_counts, original_word_counts)
        })
        
        # Calculate results
        results = {
            'word_counts': {
                'original': sum(original_word_counts),
                'modified': sum(modified_word_counts)
            },
            'replacement_stats': {
                'total_replacements': sum(replacement_counts),
                'total_rows': len(df),
                'avg_replacements_per_row': sum(replacement_counts) / len(df) if len(df) > 0 else 0
            }
        }
        
        # Cache results to disk
        cache_file = os.path.join(self.cache_dir, f'analysis_cache_{datetime.now().strftime("%Y%m%d_%H%M%S")}.joblib')
        joblib.dump((df_modified, analysis_df, results), cache_file)
        
        return df_modified, analysis_df, results
    
    def clear_cache(self):
        """Clear both memory and disk caches"""
        self.result_cache.clear()
        self._process_text.cache_clear()
        for file in os.listdir(self.cache_dir):
            if file.startswith('analysis_cache_'):
                os.remove(os.path.join(self.cache_dir, file))

def print_analysis(results: Dict):
    """Print the analysis results in a readable format"""
    print("\nText Replacement Analysis:")
    print("-" * 60)
    print(f"{'Metric':<25} {'Value':<15}")
    print("-" * 60)
    
    print(f"{'Total replacements':<25} {results['replacement_stats']['total_replacements']:<15}")
    print(f"{'Total rows processed':<25} {results['replacement_stats']['total_rows']:<15}")
    print(f"{'Avg replacements/row':<25} {results['replacement_stats']['avg_replacements_per_row']:.2f}")
    
    print("\nWord Counts:")
    print("-" * 60)
    print(f"Original text total: {results['word_counts']['original']}")
    print(f"Modified text total: {results['word_counts']['modified']}")
    print(f"Difference: {results['word_counts']['modified'] - results['word_counts']['original']}")


#expanded = pd.read_csv('/Users/notagain/data_preprocessing/text_cleaning/expanded.csv')
replacer = TextPatternReplacer()
modified_df_2, analysis, results = replacer.analyze_texts(modified_df_1)
print_analysis(results)


Processing text batches: 100%|██████████| 33/33 [01:45<00:00,  3.19s/it]



Text Replacement Analysis:
------------------------------------------------------------
Metric                    Value          
------------------------------------------------------------
Total replacements        119249         
Total rows processed      60875          
Avg replacements/row      1.96

Word Counts:
------------------------------------------------------------
Original text total: 3643949
Modified text total: 3643892
Difference: -57


In [14]:
word_counts = count_words(modified_df_2, 'text')
word_counts.to_csv('/Users/notagain/data_preprocessing/text_cleaning/semi_word_counts.csv', index=False)

100%|██████████| 60875/60875 [00:05<00:00, 11909.52it/s]


In [15]:
!pwd

/Users/notagain/data_preprocessing/text_cleaning


In [16]:
import re
import pandas as pd
from collections import Counter

def process_hyphens(df, text_column='text'):
    """
    Process hyphens in text:
    1. Split hyphenated words (except e-mail → email)
    2. Remove standalone hyphens with spaces around them
    
    Args:
        df: DataFrame with text to analyze
        text_column: Name of the text column
        
    Returns:
        tuple: (Modified DataFrame, Analysis DataFrame, Results dictionary)
    """
    # Copy DataFrame to avoid modifying original
    df_modified = df.copy()
    
    # Initialize storage for analysis
    hyphenated_words = []
    standalone_hyphens = []
    
    # Compile regex patterns
    email_pattern = re.compile(r'\be-mails?\b', re.IGNORECASE)
    hyphenated_pattern = re.compile(r'\b\w+[-]\w+\b')
    standalone_pattern = re.compile(r'\s+-\s+')  # matches " - "
    
    def process_text(text):
        """Process individual text, handling both hyphenated words and standalone hyphens"""
        text = str(text)
        
        # Count standalone hyphens before removing them
        standalone_count = len(standalone_pattern.findall(text))
        
        # Handle e-mail special case
        text = email_pattern.sub(lambda m: 
            'email' if m.group().lower() == 'e-mail' else 'emails', 
            text
        )
        
        # Find hyphenated words before processing
        found_words = hyphenated_pattern.findall(text)
        
        # Replace hyphenated words with space-separated words
        for word in found_words:
            if not email_pattern.match(word):  # Skip if it's an e-mail
                split_version = word.replace('-', ' ')
                text = text.replace(word, split_version)
        
        # Remove standalone hyphens with spaces
        text = standalone_pattern.sub(' ', text)
        
        return text, found_words, standalone_count
    
    # Process each row
    modified_texts = []
    for text in df[text_column]:
        modified_text, found_words, standalone_count = process_text(text)
        modified_texts.append(modified_text)
        hyphenated_words.extend(found_words)
        standalone_hyphens.append(standalone_count)
    
    # Update modified DataFrame
    df_modified[text_column] = modified_texts
    
    # Create analysis DataFrame
    word_counts = Counter(hyphenated_words)
    analysis_df = pd.DataFrame({
        'original_word': list(word_counts.keys()),
        'split_into': [' '.join(word.split('-')) for word in word_counts.keys()],
        'count': list(word_counts.values())
    }).sort_values('count', ascending=False)
    
    # Calculate results
    results = {
        'total_hyphenated_found': len(hyphenated_words),
        'unique_hyphenated': len(word_counts),
        'total_standalone_hyphens': sum(standalone_hyphens),
        'email_conversions': sum(1 for text in df[text_column] 
                               if re.search(email_pattern, str(text))),
        'most_common': list(word_counts.most_common(5))
    }
    
    return df_modified, analysis_df, results

def print_analysis(results, analysis_df):
    """Print analysis of hyphen processing"""
    print("\nHyphen Processing Analysis:")
    print("-" * 60)
    print(f"Total hyphenated words found: {results['total_hyphenated_found']}")
    print(f"Unique hyphenated words: {results['unique_hyphenated']}")
    print(f"Standalone hyphens removed: {results['total_standalone_hyphens']}")
    print(f"E-mail conversions performed: {results['email_conversions']}")
    
    if len(analysis_df) > 0:
        print("\nMost common hyphenated words and their splits:")
        print("-" * 60)
        print(f"{'Original':<20} {'Split Into':<25} {'Count':<10}")
        print("-" * 60)
        for _, row in analysis_df.head().iterrows():
            print(f"{row['original_word']:<20} {row['split_into']:<25} {row['count']:<10}")

In [18]:
modified_df_no_hyp, analysis, results = process_hyphens(modified_df_2)

# Print analysis
print_analysis(results, analysis)

modified_df_3 = count_words(modified_df_no_hyp, 'text')
modified_df_3.to_csv('/Users/notagain/data_preprocessing/text_cleaning/unhyphen.csv', index=False)



Hyphen Processing Analysis:
------------------------------------------------------------
Total hyphenated words found: 6723
Unique hyphenated words: 3066
Standalone hyphens removed: 3228
E-mail conversions performed: 188

Most common hyphenated words and their splits:
------------------------------------------------------------
Original             Split Into                Count     
------------------------------------------------------------
number-number        number number             316       
follow-up            follow up                 132       
number-star          number star               105       
pick-up              pick up                   81        
on-line              on line                   71        


100%|██████████| 60875/60875 [00:05<00:00, 11297.27it/s]


In [22]:
import re
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
import multiprocessing
from typing import List, Dict, Tuple
from tqdm import tqdm

class CharacterRemover:
    """Efficient character removal processor with batching and analysis"""
    
    def __init__(self):
        # Define all character variants to remove
        self.chars_to_remove = {
            'hyphen': [
                '\u002D',  # hyphen-minus
                '\u2010',  # hyphen
                '\u2011',  # non-breaking hyphen
                '\u2012',  # figure dash
                '\u2013',  # en dash
                '\u2014',  # em dash
                '\u2015'   # horizontal bar
            ],
            'comma': [
                '\u002C',  # standard comma
                '\u201A'   # single low-9 quotation mark (sometimes used as comma)
            ],
            'quotes': [
                '\u0022',  # standard double quote
                '\u201C',  # left double quote
                '\u201D',  # right double quote
                '\u201E',  # double low-9 quote
                '\u201F',  # double high-reversed-9 quote
                '\u2033',  # double prime
                '\u2034',  # triple prime
                '\u2057'   # quadruple prime
            ]
        }
        
        # Create regex pattern for all characters
        all_chars = ''.join([char for sublist in self.chars_to_remove.values() 
                           for char in sublist])
        self.remove_pattern = re.compile(f'[{re.escape(all_chars)}]')
        
        # Set optimal batch size based on CPU count
        self.cpu_count = multiprocessing.cpu_count()
        self.default_batch_size = 1000
    
    def process_batch(self, texts: List[str]) -> List[Tuple[str, Dict]]:
        """Process a batch of texts"""
        results = []
        for text in texts:
            # Count original occurrences
            counts = {
                char_type: sum(text.count(char) for char in chars)
                for char_type, chars in self.chars_to_remove.items()
            }
            
            # Remove characters
            cleaned_text = self.remove_pattern.sub('', text)
            
            results.append((cleaned_text, counts))
        return results

    def remove_characters(self, df: pd.DataFrame, text_column: str = 'text', 
                         batch_size: int = None) -> Tuple[pd.DataFrame, pd.DataFrame, Dict]:
        """
        Remove specified characters from text with batch processing.
        
        Args:
            df: Input DataFrame
            text_column: Name of text column
            batch_size: Optional custom batch size
            
        Returns:
            tuple: (Modified DataFrame, Analysis DataFrame, Results dictionary)
        """
        # Copy DataFrame
        df_modified = df.copy()
        
        # Set batch size
        batch_size = batch_size or self.default_batch_size
        
        # Prepare batches
        texts = df[text_column].astype(str).tolist()
        batches = [texts[i:i + batch_size] 
                  for i in range(0, len(texts), batch_size)]
        
        # Process batches in parallel
        all_results = []
        total_counts = {char_type: 0 for char_type in self.chars_to_remove.keys()}
        
        with ThreadPoolExecutor(max_workers=self.cpu_count) as executor:
            futures = [executor.submit(self.process_batch, batch) 
                      for batch in batches]
            
            # Process results with progress bar
            for future in tqdm(futures, desc="Processing batches"):
                batch_results = future.result()
                all_results.extend(batch_results)
        
        # Unzip results
        cleaned_texts, char_counts = zip(*all_results)
        
        # Update DataFrame
        df_modified[text_column] = cleaned_texts
        
        # Calculate total counts
        for counts in char_counts:
            for char_type, count in counts.items():
                total_counts[char_type] += count
        
        # Create analysis DataFrame
        analysis_df = pd.DataFrame({
            'original_text': df[text_column],
            'cleaned_text': cleaned_texts,
            'chars_removed': [len(orig) - len(clean) 
                            for orig, clean in zip(df[text_column], cleaned_texts)]
        })
        
        # Compile results
        results = {
            'total_rows_processed': len(df),
            'total_chars_removed': sum(analysis_df['chars_removed']),
            'character_counts': total_counts,
            'avg_chars_removed': sum(analysis_df['chars_removed']) / len(df)
        }
        
        return df_modified, analysis_df, results

def print_removal_analysis(results: Dict):
    """Print analysis of character removal"""
    print("\nCharacter Removal Analysis:")
    print("-" * 60)
    print(f"Total rows processed: {results['total_rows_processed']}")
    print(f"Total characters removed: {results['total_chars_removed']}")
    print(f"Average characters removed per row: {results['avg_chars_removed']:.2f}")
    
    print("\nCharacters removed by type:")
    print("-" * 60)
    for char_type, count in results['character_counts'].items():
        print(f"{char_type.title():<15} {count:>10}")

remover = CharacterRemover()
modified_df_4, analysis, results = remover.remove_characters(modified_df_no_hyp)

# Print analysis
print_removal_analysis(results)

Processing batches: 100%|██████████| 61/61 [00:00<00:00, 36813.32it/s]


Character Removal Analysis:
------------------------------------------------------------
Total rows processed: 60875
Total characters removed: 109717
Average characters removed per row: 1.80

Characters removed by type:
------------------------------------------------------------
Hyphen                3662
Comma                94692
Quotes               11363





In [21]:


count_modified_df_4 = count_words(modified_df_4, 'text')

count_modified_df_4.to_csv('/Users/notagain/data_preprocessing/text_cleaning/modified_df_4.csv', index=False)

100%|██████████| 60875/60875 [00:05<00:00, 11258.21it/s]


In [117]:
import re
import pandas as pd
from collections import Counter

def clean_text(df, text_column='text'):
    """
    Clean text with guaranteed spaces around punctuation.
    
    Args:
        df: DataFrame with text to analyze
        text_column: Name of text column to process
    
    Returns:
        tuple: (Modified DataFrame, Analysis DataFrame, Results dictionary)
    """
    # Copy DataFrame to avoid modifying original
    df_modified = df.copy()
    
    # Initialize counters for analysis
    replacements = {
        'periods': 0,
        'quotes': 0,
        'ampersands': 0,
        'commas': 0,
        'currency': 0,
        'parentheses': 0,
        'exclamations': 0,
        'extra_spaces': 0,
        'slashes': 0
    }
    
    def process_text(text):
        """Process individual text with all cleaning rules"""
        text = str(text)
        original_length = len(text)
        
        # Store original counts
        replacements['quotes'] += text.count('"') + text.count("'") + text.count('"') + text.count('"')
        replacements['ampersands'] += text.count('&')
        replacements['commas'] += text.count(',')
        replacements['periods'] += text.count('.')
        replacements['parentheses'] += text.count('(') + text.count(')') + text.count('[') + text.count(']')
        replacements['exclamations'] += text.count('!')
        replacements['slashes'] += text.count('/')
        
        # 1. Remove ALL quotation marks
        text = re.sub(r'[""\'"]', '', text)
        
        # 2. Handle number/number pattern
        text = re.sub(r'\b\d+/\d+\b', 'number', text)
        
        # 3. Handle ellipsis first (preserve it)
        text = re.sub(r'\.{3}', 'ELLIPSIS_PLACEHOLDER', text)
        
        # 4. Guarantee space before and after every comma
        text = re.sub(r'\s*,\s*', ' , ', text)
        
        # 5. Guarantee space before and after every period (except in numbers)
        text = re.sub(r'(?<!\d)\s*\.\s*(?!\d)', ' . ', text)
        
        # 6. Replace & with 'and'
        text = re.sub(r'&', 'and', text)
        
        # 7. Remove currency symbols
        text = re.sub(r'[$€£¥¢]', '', text)
        
        # 8. Remove parentheses
        text = re.sub(r'[\(\)\[\]]', '', text)
        
        # 9. Add space before exclamation points
        text = re.sub(r'(?<=[^\s])!', ' !', text)
        
        # 10. Restore ellipsis
        text = text.replace('ELLIPSIS_PLACEHOLDER', '...')
        
        # 11. Clean up extra spaces
        text = re.sub(r'\s+', ' ', text).strip()
        
        # Count removed extra spaces
        replacements['extra_spaces'] += original_length - len(text)
        
        return text
    
    # Process each row
    modified_texts = []
    for text in df[text_column]:
        modified_text = process_text(text)
        modified_texts.append(modified_text)
    
    # Update modified DataFrame
    df_modified[text_column] = modified_texts
    
    # Create analysis DataFrame
    analysis_df = pd.DataFrame({
        'original_text': df[text_column],
        'cleaned_text': modified_texts,
        'chars_removed': [len(str(orig)) - len(clean) 
                         for orig, clean in zip(df[text_column], modified_texts)]
    })
    
    # Calculate results
    results = {
        'total_rows_processed': len(df),
        'total_chars_removed': sum(analysis_df['chars_removed']),
        'replacements': replacements
    }
    
    return df_modified, analysis_df, results

def print_cleaning_analysis(results):
    """Print analysis of text cleaning operations"""
    print("\nText Cleaning Analysis:")
    print("-" * 60)
    print(f"Total rows processed: {results['total_rows_processed']}")
    print(f"Total characters removed: {results['total_chars_removed']}")
    
    print("\nReplacements by type:")
    print("-" * 60)
    for key, value in results['replacements'].items():
        print(f"{key.replace('_', ' ').title():<20} {value:>10}")





In [23]:
import re
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
import multiprocessing
from typing import List, Dict, Tuple
from tqdm import tqdm

class TextProcessor:
    def __init__(self):
        # Define patterns
        self.patterns = {
            'currency': r'\$\d*\.?\d+|\$',  # Matches $, $50, $50.00
            'ampersand': r'\s*&\s*',  # & with optional spaces
            'punctuation': r'\s*([,.!?:;])\s*',  # Common punctuation
            'number_patterns': [
                r'\b\d+/\d+\b',  # number/number
                r'\b\w*number\w*\b',  # words containing number
                r'\b\w*\d+\w*\b',  # words with digits
                r'\bnumbernumber\b',  # explicit numbernumber
                r'\bsho\d+\b'  # shonumber
            ]
        }
        
        # Compile regex patterns for efficiency
        self.currency_pattern = re.compile(self.patterns['currency'])
        self.ampersand_pattern = re.compile(self.patterns['ampersand'])
        self.punct_pattern = re.compile(self.patterns['punctuation'])
        self.number_patterns = [re.compile(pattern) for pattern in self.patterns['number_patterns']]
        
        # Set optimal batch size based on CPU count
        self.cpu_count = multiprocessing.cpu_count()
        self.default_batch_size = 1000

    def process_text(self, text: str) -> Tuple[str, Dict]:
        """Process individual text with all rules"""
        text = str(text)
        replacements = {
            'currency': 0,
            'ampersand': 0,
            'punctuation': 0,
            'number_patterns': 0
        }

        # Step 1: Count and remove currency symbols
        currency_matches = len(self.currency_pattern.findall(text))
        text = self.currency_pattern.sub('', text)
        replacements['currency'] = currency_matches

        # Step 2: Replace & with 'and'
        ampersand_matches = len(self.ampersand_pattern.findall(text))
        text = self.ampersand_pattern.sub(' and ', text)
        replacements['ampersand'] = ampersand_matches

        # Step 3: Process all number patterns
        for pattern in self.number_patterns:
            matches = pattern.findall(text)
            replacements['number_patterns'] += len(matches)
            text = pattern.sub('number', text)

        # Step 4: Add spaces around punctuation
        punct_matches = len(self.punct_pattern.findall(text))
        text = self.punct_pattern.sub(r' \1 ', text)
        replacements['punctuation'] = punct_matches

        # Step 5: Clean up extra whitespace
        text = ' '.join(text.split())

        return text, replacements

    def process_batch(self, texts: List[str]) -> List[Tuple[str, Dict]]:
        """Process a batch of texts"""
        return [self.process_text(text) for text in texts]

    def process_dataframe(self, df: pd.DataFrame, text_column: str = 'text', 
                         batch_size: int = None) -> Tuple[pd.DataFrame, pd.DataFrame, Dict]:
        """
        Process text with batch processing and analysis.
        
        Args:
            df: Input DataFrame
            text_column: Name of text column
            batch_size: Optional custom batch size
        
        Returns:
            tuple: (Modified DataFrame, Analysis DataFrame, Results dictionary)
        """
        df_modified = df.copy()
        batch_size = batch_size or self.default_batch_size

        # Prepare batches
        texts = df[text_column].astype(str).tolist()
        batches = [texts[i:i + batch_size] 
                  for i in range(0, len(texts), batch_size)]

        # Process batches in parallel
        all_results = []
        total_replacements = {
            'currency': 0,
            'ampersand': 0,
            'punctuation': 0,
            'number_patterns': 0
        }

        with ThreadPoolExecutor(max_workers=self.cpu_count) as executor:
            futures = [executor.submit(self.process_batch, batch) 
                      for batch in batches]
            
            # Process results with progress bar
            for future in tqdm(futures, desc="Processing text batches"):
                batch_results = future.result()
                all_results.extend(batch_results)

        # Unzip results
        processed_texts, replacements_list = zip(*all_results)

        # Update DataFrame
        df_modified[text_column] = processed_texts

        # Sum up replacements
        for rep in replacements_list:
            for key in total_replacements:
                total_replacements[key] += rep[key]

        # Create analysis DataFrame
        analysis_df = pd.DataFrame({
            'original_text': df[text_column],
            'processed_text': processed_texts,
            'chars_difference': [len(str(orig)) - len(proc) 
                               for orig, proc in zip(df[text_column], processed_texts)]
        })

        # Compile results
        results = {
            'total_rows_processed': len(df),
            'total_chars_difference': sum(analysis_df['chars_difference']),
            'replacements': total_replacements,
            'avg_chars_difference': sum(analysis_df['chars_difference']) / len(df)
        }

        return df_modified, analysis_df, results

def print_processing_analysis(results: Dict):
    """Print analysis of text processing"""
    print("\nText Processing Analysis:")
    print("-" * 60)
    print(f"Total rows processed: {results['total_rows_processed']}")
    print(f"Total character difference: {results['total_chars_difference']}")
    print(f"Average character difference per row: {results['avg_chars_difference']:.2f}")
    
    print("\nReplacements by type:")
    print("-" * 60)
    for replacement_type, count in results['replacements'].items():
        print(f"{replacement_type.replace('_', ' ').title():<20} {count:>10}")

processor = TextProcessor()
modified_df_5, analysis, results = processor.process_dataframe(modified_df_4)

# Print analysis
print_processing_analysis(results)




count_modified_df_5 = count_words(modified_df_5, 'text')

count_modified_df_5.to_csv('/Users/notagain/data_preprocessing/text_cleaning/modified_df_5.csv', index=False)


Processing text batches: 100%|██████████| 61/61 [00:10<00:00,  5.70it/s]



Text Processing Analysis:
------------------------------------------------------------
Total rows processed: 60875
Total character difference: 9876
Average character difference per row: 0.16

Replacements by type:
------------------------------------------------------------
Currency                   6928
Ampersand                  2483
Punctuation              272679
Number Patterns          173479


100%|██████████| 60875/60875 [00:04<00:00, 12491.71it/s]


In [27]:
import re
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
import multiprocessing
from tqdm import tqdm
from typing import List, Dict, Tuple

class TextCleaner:
    def __init__(self):
        # Define all patterns as strings
        self.number_variations = [
            (r'\bnumber/number\b', 'number'),
            (r'\bnumber%\b', 'number'),
            (r'\bnumber\)', 'number'),
            (r'\bnumber\+', 'number'),
            (r'\(\s*number\s*\)', 'number'),
            (r'\bnumber\'s?\b', 'number'),
            (r'%\s*number', 'number'),
            (r'\(\s*number', 'number'),
            (r'\bnumber\s*[\'\'\u2019]s?\b', 'number')
        ]
        
        self.parentheses_patterns = [
            (r'\([^)]*\)', ''),  # remove content within parentheses
            (r'[\(\)]', '')      # remove remaining parentheses
        ]
        
        self.standalone_chars = r'\s+([etisnlochgwbm\+/])\s+'
        self.im_replacement = r'\b(?:i\'m|im)\b'
        self.standalone_i = r'\s+i\s+'
        
        # Compile patterns for efficiency
        self.number_patterns = [(re.compile(pattern), repl) 
                              for pattern, repl in self.number_variations]
        self.parentheses_patterns = [(re.compile(pattern), repl) 
                                   for pattern, repl in self.parentheses_patterns]
        self.standalone_pattern = re.compile(self.standalone_chars, re.IGNORECASE)
        self.im_pattern = re.compile(self.im_replacement, re.IGNORECASE)
        self.i_pattern = re.compile(self.standalone_i)
        
        # Set optimal batch size
        self.cpu_count = multiprocessing.cpu_count()
        self.default_batch_size = 1000

    def process_text(self, text: str) -> Tuple[str, Dict]:
        """Process individual text with all rules"""
        text = str(text)
        replacements = {
            'number_variations': 0,
            'parentheses': 0,
            'standalone_chars': 0,
            'im_replacements': 0,
            'i_capitalizations': 0
        }

        # Store original length for comparison
        original_length = len(text)

        # 1. Handle all number variations
        for pattern, repl in self.number_patterns:
            text, count = re.subn(pattern, repl, text)
            replacements['number_variations'] += count

        # 2. Remove parentheses and their content
        for pattern, repl in self.parentheses_patterns:
            text, count = re.subn(pattern, repl, text)
            replacements['parentheses'] += count

        # 3. Replace "im" with "i am"
        text, im_count = re.subn(self.im_pattern, 'i am', text)
        replacements['im_replacements'] = im_count

        # 4. Capitalize standalone "i"
        text, i_count = re.subn(self.i_pattern, ' I ', text)
        replacements['i_capitalizations'] = i_count

        # 5. Remove standalone characters
        text, char_count = re.subn(self.standalone_pattern, ' ', text)
        replacements['standalone_chars'] = char_count

        # 6. Clean up extra whitespace
        text = ' '.join(text.split())

        return text, replacements

    def process_batch(self, texts: List[str]) -> List[Tuple[str, Dict]]:
        """Process a batch of texts"""
        return [self.process_text(text) for text in texts]

    def clean_texts(self, df: pd.DataFrame, text_column: str = 'text', 
                   batch_size: int = None) -> Tuple[pd.DataFrame, pd.DataFrame, Dict]:
        """
        Clean texts with batch processing and analysis.
        """
        df_modified = df.copy()
        batch_size = batch_size or self.default_batch_size

        # Prepare batches
        texts = df[text_column].astype(str).tolist()
        batches = [texts[i:i + batch_size] 
                  for i in range(0, len(texts), batch_size)]

        # Process batches in parallel
        all_results = []
        total_replacements = {
            'number_variations': 0,
            'parentheses': 0,
            'standalone_chars': 0,
            'im_replacements': 0,
            'i_capitalizations': 0
        }

        # Process with progress bar
        with ThreadPoolExecutor(max_workers=self.cpu_count) as executor:
            futures = [executor.submit(self.process_batch, batch) 
                      for batch in batches]
            
            for future in tqdm(futures, desc="Processing text batches"):
                batch_results = future.result()
                all_results.extend(batch_results)

        # Unzip results
        processed_texts, replacements_list = zip(*all_results)

        # Update DataFrame
        df_modified[text_column] = processed_texts

        # Sum up replacements
        for rep in replacements_list:
            for key in total_replacements:
                total_replacements[key] += rep[key]

        # Create analysis DataFrame
        analysis_df = pd.DataFrame({
            'original_text': df[text_column],
            'processed_text': processed_texts,
            'chars_difference': [len(str(orig)) - len(proc) 
                               for orig, proc in zip(df[text_column], processed_texts)]
        })

        # Compile results
        results = {
            'total_rows_processed': len(df),
            'total_chars_removed': sum(analysis_df['chars_difference']),
            'replacements': total_replacements,
            'avg_chars_removed': sum(analysis_df['chars_difference']) / len(df)
        }

        return df_modified, analysis_df, results

def print_cleaning_analysis(results: Dict):
    """Print analysis of text cleaning"""
    print("\nText Cleaning Analysis:")
    print("-" * 60)
    print(f"Total rows processed: {results['total_rows_processed']}")
    print(f"Total characters removed: {results['total_chars_removed']}")
    print(f"Average characters removed per row: {results['avg_chars_removed']:.2f}")
    
    print("\nReplacements by type:")
    print("-" * 60)
    for key, value in results['replacements'].items():
        print(f"{key.replace('_', ' ').title():<25} {value:>10}")




cleaner = TextCleaner()
modified_df_6, analysis, results = cleaner.clean_texts(modified_df_5)

# Print analysis
print_cleaning_analysis(results)



count_modified_df_6 = count_words(modified_df_6, 'text')

count_modified_df_6.to_csv('/Users/notagain/data_preprocessing/text_cleaning/modified_df_6.csv', index=False)

Processing text batches: 100%|██████████| 61/61 [00:05<00:00, 10.49it/s]



Text Cleaning Analysis:
------------------------------------------------------------
Total rows processed: 60875
Total characters removed: 528648
Average characters removed per row: 8.68

Replacements by type:
------------------------------------------------------------
Number Variations               6033
Parentheses                     7150
Standalone Chars              122000
Im Replacements                  271
I Capitalizations              17412


100%|██████████| 60875/60875 [00:05<00:00, 10913.42it/s]


In [28]:

import re
import pandas as pd
from tqdm import tqdm

def clean_number_words(df: pd.DataFrame, text_column: str = 'text') -> tuple:
    """
    Replace any word containing 'number' with just 'number'.
    
    Args:
        df: Input DataFrame
        text_column: Name of text column
    
    Returns:
        tuple: (Modified DataFrame, Analysis DataFrame, Results dictionary)
    """
    # Copy DataFrame
    df_modified = df.copy()
    
    # Create pattern to match any word containing 'number'
    # \S* matches any non-whitespace characters
    # \b represents word boundary
    pattern = r'\b\S*number\S*\b|\b\S*NUMBER\S*\b'
    
    # Initialize storage for analysis
    original_words = []
    replacements_count = []
    
    # Process each text
    modified_texts = []
    print("Processing texts...")
    
    for text in tqdm(df[text_column].astype(str)):
        # Find all matches in this text
        matches = re.findall(pattern, text, re.IGNORECASE)
        
        # Store original words found
        original_words.extend(matches)
        
        # Replace all variations with 'number'
        modified_text = re.sub(pattern, 'number', text, flags=re.IGNORECASE)
        
        modified_texts.append(modified_text)
        replacements_count.append(len(matches))
    
    # Update DataFrame
    df_modified[text_column] = modified_texts
    
    # Create word frequency analysis
    word_freq = pd.Series(original_words).value_counts().reset_index()
    word_freq.columns = ['original_word', 'count']
    
    # Create analysis DataFrame
    analysis_df = pd.DataFrame({
        'original_text': df[text_column],
        'processed_text': modified_texts,
        'replacements': replacements_count
    })
    
    # Compile results
    results = {
        'total_rows_processed': len(df),
        'total_replacements': sum(replacements_count),
        'unique_variations': len(word_freq),
        'avg_replacements_per_row': sum(replacements_count) / len(df)
    }
    
    return df_modified, analysis_df, word_freq, results

def print_number_analysis(results: dict, word_freq: pd.DataFrame):
    """Print analysis of number word cleaning"""
    print("\nNumber Word Analysis:")
    print("-" * 60)
    print(f"Total rows processed: {results['total_rows_processed']}")
    print(f"Total replacements made: {results['total_replacements']}")
    print(f"Unique word variations found: {results['unique_variations']}")
    print(f"Average replacements per row: {results['avg_replacements_per_row']:.2f}")
    
    print("\nTop 20 original word variations:")
    print("-" * 60)
    print(word_freq.head(20).to_string(index=False))

# Example usage


# Process the DataFrame
modified_df_7, analysis_df, word_freq, results = clean_number_words(modified_df_6)

# Print analysis
print_number_analysis(results, word_freq)

count_modified_df_7 = count_words(modified_df_7, 'text')

count_modified_df_7.to_csv('/Users/notagain/data_preprocessing/text_cleaning/modified_df_7.csv', index=False)


Processing texts...


100%|██████████| 60875/60875 [00:03<00:00, 15245.87it/s]



Number Word Analysis:
------------------------------------------------------------
Total rows processed: 60875
Total replacements made: 168276
Unique word variations found: 375
Average replacements per row: 2.76

Top 20 original word variations:
------------------------------------------------------------
     original_word  count
            number 166951
     number/number    653
      numbernumber    138
          number°F     22
            NUMBER     19
          number°C     13
            Number     10
         number/lb      9
           Numbers      7
      number/email      5
     number/family      5
         number/mo      5
       Edit#number      4
         numberThe      4
    number/freezer      4
         numberoff      4
          number/m      3
numbernumbernumber      3
         number/hr      3
         numberand      3


100%|██████████| 60875/60875 [00:04<00:00, 12594.43it/s]


In [29]:

import re
import pandas as pd
from tqdm import tqdm

def clean_number_words(df: pd.DataFrame, text_column: str = 'text') -> tuple:
    """
    Replace any word containing 'number' anywhere in it with just 'number'.
    
    Args:
        df: Input DataFrame
        text_column: Name of text column
    
    Returns:
        tuple: (Modified DataFrame, Analysis DataFrame, Results dictionary)
    """
    # Copy DataFrame
    df_modified = df.copy()
    
    # Pattern to match any word containing 'number' anywhere within it
    # \S* matches any non-whitespace characters before and after
    pattern = r'\S*number\S*|\S*NUMBER\S*'
    
    # Initialize storage for analysis
    original_words = []
    replacements_count = []
    
    # Process each text
    modified_texts = []
    print("Processing texts...")
    
    for text in tqdm(df[text_column].astype(str)):
        # Find all matches in this text
        matches = re.findall(pattern, text, re.IGNORECASE)
        
        # Store original words found
        original_words.extend(matches)
        
        # Replace all variations with 'number'
        modified_text = re.sub(pattern, 'number', text, flags=re.IGNORECASE)
        
        modified_texts.append(modified_text)
        replacements_count.append(len(matches))
    
    # Update DataFrame
    df_modified[text_column] = modified_texts
    
    # Create word frequency analysis
    word_freq = pd.Series(original_words).value_counts().reset_index()
    word_freq.columns = ['original_word', 'count']
    
    # Create analysis DataFrame
    analysis_df = pd.DataFrame({
        'original_text': df[text_column],
        'processed_text': modified_texts,
        'replacements': replacements_count
    })
    
    # Compile results
    results = {
        'total_rows_processed': len(df),
        'total_replacements': sum(replacements_count),
        'unique_variations': len(word_freq),
        'avg_replacements_per_row': sum(replacements_count) / len(df)
    }
    
    return df_modified, analysis_df, word_freq, results

def print_number_analysis(results: dict, word_freq: pd.DataFrame):
    """Print analysis of number word cleaning"""
    print("\nNumber Word Analysis:")
    print("-" * 60)
    print(f"Total rows processed: {results['total_rows_processed']}")
    print(f"Total replacements made: {results['total_replacements']}")
    print(f"Unique word variations found: {results['unique_variations']}")
    print(f"Average replacements per row: {results['avg_replacements_per_row']:.2f}")
    
    print("\nTop 20 original word variations:")
    print("-" * 60)
    print(word_freq.head(20).to_string(index=False))

# Test DataFrame with various number patterns


# Process the DataFrame
modified_df_8, analysis_df, word_freq, results = clean_number_words(modified_df_7)

# Print analysis
print_number_analysis(results, word_freq)



# Print analysis


count_modified_df_8 = count_words(modified_df_8, 'text')

count_modified_df_8.to_csv('/Users/notagain/data_preprocessing/text_cleaning/modified_df_8.csv', index=False)


Processing texts...


100%|██████████| 60875/60875 [00:08<00:00, 6822.66it/s]



Number Word Analysis:
------------------------------------------------------------
Total rows processed: 60875
Total replacements made: 168276
Unique word variations found: 74
Average replacements per row: 2.76

Top 20 original word variations:
------------------------------------------------------------
original_word  count
       number 165703
      number%   1583
      #number    236
      £number    188
      number'     97
      number’     65
      number…     60
      number*     33
      ~number     31
      number°     31
      number/     28
      +number     21
      number€     17
      number#     16
      'number     16
      €number     14
      @number     12
     'number'     10
     number''      7
      /number      7


100%|██████████| 60875/60875 [00:05<00:00, 11877.74it/s]


In [30]:

import pandas as pd
import re
from collections import Counter
from tqdm import tqdm
from typing import Dict, Tuple

def replace_rare_words(df: pd.DataFrame, text_column: str = 'text', 
                      min_frequency: int = 10) -> Tuple[pd.DataFrame, pd.DataFrame, Dict]:
    """
    Count word occurrences and replace rare words with 'unknown'.
    
    Args:
        df: Input DataFrame
        text_column: Name of text column
        min_frequency: Minimum frequency threshold (default 10)
    
    Returns:
        tuple: (Modified DataFrame, Word Frequency DataFrame, Results dictionary)
    """
    # Copy DataFrame
    df_modified = df.copy()
    
    # Initialize word counter
    word_counter = Counter()
    
    print("Counting word frequencies...")
    # First pass: count all words
    for text in tqdm(df[text_column].astype(str)):
        # Split into words and count
        words = text.split()
        word_counter.update(words)
    
    # Create sets for quick lookup
    common_words = {word for word, count in word_counter.items() 
                   if count >= min_frequency}
    
    # Create frequency DataFrame
    freq_df = pd.DataFrame({
        'word': list(word_counter.keys()),
        'frequency': list(word_counter.values())
    }).sort_values('frequency', ascending=False)
    
    print("\nReplacing rare words...")
    # Second pass: replace rare words
    modified_texts = []
    replacement_counts = []
    
    for text in tqdm(df[text_column].astype(str)):
        words = text.split()
        replacements = 0
        
        # Replace rare words
        modified_words = [
            word if word in common_words else 'unknown'
            for word in words
        ]
        
        # Count replacements
        replacements = sum(1 for orig, mod in zip(words, modified_words) 
                         if orig != mod)
        
        modified_texts.append(' '.join(modified_words))
        replacement_counts.append(replacements)
    
    # Update DataFrame
    df_modified[text_column] = modified_texts
    
    # Compile results
    results = {
        'total_words_processed': sum(word_counter.values()),
        'unique_words': len(word_counter),
        'words_above_threshold': len(common_words),
        'words_replaced': len(word_counter) - len(common_words),
        'total_replacements': sum(replacement_counts),
        'avg_replacements_per_row': sum(replacement_counts) / len(df)
    }
    
    return df_modified, freq_df, results

def print_word_analysis(results: Dict, freq_df: pd.DataFrame):
    """Print analysis of word replacement"""
    print("\nWord Frequency Analysis:")
    print("-" * 60)
    print(f"Total words processed: {results['total_words_processed']}")
    print(f"Unique words found: {results['unique_words']}")
    print(f"Words above threshold: {results['words_above_threshold']}")
    print(f"Words replaced with 'unknown': {results['words_replaced']}")
    print(f"Total replacements made: {results['total_replacements']}")
    print(f"Average replacements per row: {results['avg_replacements_per_row']:.2f}")
    
    print("\nTop 20 most frequent words:")
    print("-" * 60)
    print(freq_df.head(20).to_string(index=False))
    
    print("\nLeast frequent words (sample of 20):")
    print("-" * 60)
    print(freq_df.tail(20).to_string(index=False))

modified_df_9, freq_df, results = replace_rare_words(modified_df_8, min_frequency=10)

# Print analysis
print_word_analysis(results, freq_df)

count_modified_df_9 = count_words(modified_df_9, 'text')

count_modified_df_9.to_csv('/Users/notagain/data_preprocessing/text_cleaning/modified_df_9.csv', index=False)



Counting word frequencies...


100%|██████████| 60875/60875 [00:01<00:00, 55757.41it/s]



Replacing rare words...


100%|██████████| 60875/60875 [00:01<00:00, 58448.40it/s]



Word Frequency Analysis:
------------------------------------------------------------
Total words processed: 3764870
Unique words found: 60303
Words above threshold: 11019
Words replaced with 'unknown': 49284
Total replacements made: 105083
Average replacements per row: 1.73

Top 20 most frequent words:
------------------------------------------------------------
  word  frequency
     .     236034
number     168276
   the     144770
    to     111223
   and     106624
     a      76301
   was      62578
    of      47314
   not      47247
    it      46508
    is      43704
   for      42989
    my      41522
  that      36621
    in      33932
  with      32126
  have      32025
    on      26853
     !      25091
  they      24840

Least frequent words (sample of 20):
------------------------------------------------------------
               word  frequency
              PLUGS          1
                EAR          1
            Iyengar          1
           trainer/          1
 

100%|██████████| 60875/60875 [00:04<00:00, 12710.96it/s]
