In [6]:
# !pip install py-readability-metrics

In [7]:
import pandas as pd
import nltk
import re

nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from readability import Readability
from time import time

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ExPertComputer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ExPertComputer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Load Data

In [8]:
# Column labels supplied to read_csv for both splits via `names=`.
column_names = ['index', 'timestamp', 'query', 'query_type', 'username', 'text']

# Read the test split first, then the train split, with identical options.
test_df, train_df = (
    pd.read_csv(csv_path, encoding='latin1', names=column_names)
    for csv_path in ('test.csv', 'train.csv')
)

### Text Cleaning

In [9]:
def clean_text(text):
    """Strip non-letter characters from ``text`` and normalise its whitespace.

    Every character that is neither an ASCII letter nor whitespace is dropped,
    runs of whitespace are collapsed to a single space, and leading/trailing
    whitespace is trimmed.

    Args:
        text: The raw string to clean.

    Returns:
        tuple: ``(cleaned, removed)`` where ``cleaned`` is the normalised
        string and ``removed`` is ``len(text) - len(cleaned)``. The count
        therefore also includes whitespace lost to collapsing and trimming,
        not only punctuation and digits.
    """
    letters_and_spaces = re.sub(r'[^a-zA-Z\s]', '', text)
    cleaned = re.sub(r'\s+', ' ', letters_and_spaces).strip()
    removed = len(text) - len(cleaned)
    return cleaned, removed

### Tokenization

In [10]:
def tokenize_text(text):
    """Tokenize ``text`` at sentence and word level with NLTK.

    Args:
        text: The string to tokenize.

    Returns:
        tuple: ``(sentences, words)`` — the output of ``sent_tokenize(text)``
        followed by the output of ``word_tokenize(text)``.
    """
    return sent_tokenize(text), word_tokenize(text)

### Lowercasing and Stopword removal 

In [11]:
def _english_stop_words():
    """Return NLTK's English stop-word list as a frozenset, loading it only once."""
    cached = getattr(_english_stop_words, '_cache', None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        # Stored on the function object so repeated calls skip the corpus read.
        _english_stop_words._cache = cached
    return cached


def process_tokens(tokens, stop_words=None):
    """Lowercase ``tokens`` and drop stop words.

    Args:
        tokens: Sequence of word tokens (must support ``len``).
        stop_words: Optional iterable of lowercase words to remove. When
            omitted, NLTK's English stop-word list is used; it is loaded on
            the first call and reused afterwards instead of being re-read for
            every document.

    Returns:
        tuple: ``(filtered_tokens, removed)`` where ``filtered_tokens`` is the
        list of lowercased tokens that are not stop words and ``removed`` is
        the number of tokens that were dropped.
    """
    if stop_words is None:
        stop_words = _english_stop_words()
    else:
        stop_words = frozenset(stop_words)
    lowered = [word.lower() for word in tokens]
    filtered_tokens = [word for word in lowered if word not in stop_words]
    return filtered_tokens, len(tokens) - len(filtered_tokens)


### Emoticons, Stemming and Lemmatization

In [12]:
def process_advanced(tokens):
    """Produce a stemmed and a lemmatized version of ``tokens``.

    Args:
        tokens: Iterable of word tokens.

    Returns:
        tuple: ``(stems, lemmas)`` — two lists holding the Porter stem and the
        WordNet lemma of every token, in input order.

    NOTE(review): the notebook only downloads the 'punkt' and 'stopwords'
    NLTK packages; the WordNet lemmatizer presumably needs the 'wordnet'
    corpus as well — confirm before calling this function.
    """
    porter = PorterStemmer()
    wordnet = WordNetLemmatizer()
    stems = list(map(porter.stem, tokens))
    lemmas = list(map(wordnet.lemmatize, tokens))
    return stems, lemmas

### Phone number, account, and address

In [13]:
def detect_sensitive_info(text):
    """Count address-, phone- and account-like patterns in ``text``.

    Patterns:
        * address: digits, one alphabetic word, then a street suffix
          (``Street``, ``St``, ``Avenue`` ...). The suffix must end at a word
          boundary so that ordinary words which merely start with a suffix
          ("Drinking", "Stella", "Lnk") are not counted as addresses.
        * phone: ten digits grouped 3-3-4 with optional ``-``, ``.`` or
          whitespace separators.
        * account: any standalone run of ten or more digits.

    An unseparated ten-digit number satisfies both the phone and the account
    pattern and is counted once in each.

    Args:
        text: The raw string to scan.

    Returns:
        tuple: ``(address_count, phone_count, account_count)``.
    """
    # Non-capturing group + trailing \b: match whole suffix words only.
    address_pattern = (
        r'\d+\s+[a-zA-Z]+\s+'
        r'(?:Street|St|Avenue|Ave|Road|Rd|Lane|Ln|Boulevard|Blvd|Drive|Dr)\b'
    )
    phone_pattern = r'\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b'
    account_pattern = r'\b\d{10,}\b'

    addresses = re.findall(address_pattern, text)
    phones = re.findall(phone_pattern, text)
    accounts = re.findall(account_pattern, text)
    return len(addresses), len(phones), len(accounts)

### Statistics Calculation

In [14]:
def calculate_statistics(text, tokens, sentences):
    """Compute per-document word and sentence statistics.

    Args:
        text: The document text. Not used by the computation; kept so existing
            callers keep working.
        tokens: Sequence of word tokens for the document.
        sentences: Sequence of sentence strings; each one is re-tokenized with
            ``word_tokenize`` to obtain its length in tokens.

    Returns:
        dict: ``average_sentence_length``, ``word_count``, ``sentence_count``,
        ``vocabulary_size``, ``max_word_length``, ``min_sentence_length`` and
        ``max_sentence_length``. Every length/average falls back to 0 when
        ``tokens`` or ``sentences`` is empty instead of raising.
    """
    sentence_lengths = [len(word_tokenize(sent)) for sent in sentences]
    sentence_count = len(sentences)
    # Guard the division: an empty document has no sentences.
    average_length = sum(sentence_lengths) / sentence_count if sentence_count else 0.0
    return {
        'average_sentence_length': average_length,
        'word_count': len(tokens),
        'sentence_count': sentence_count,
        'vocabulary_size': len(set(tokens)),
        # default=0 avoids ValueError from max()/min() on empty input.
        'max_word_length': max((len(word) for word in tokens), default=0),
        'min_sentence_length': min(sentence_lengths, default=0),
        'max_sentence_length': max(sentence_lengths, default=0),
    }

### Automated Metrics

In [15]:
def evaluate_metrics(text, min_words=100):
    """Compute the Flesch-Kincaid score and lexical diversity of ``text``.

    Args:
        text: The document text.
        min_words: Minimum number of ``word_tokenize`` tokens required before
            the metrics are computed. Defaults to 100, the threshold the
            original code used.
            NOTE(review): py-readability-metrics presumably enforces its own
            100-word minimum, so lowering this value may make
            ``flesch_kincaid()`` raise — confirm before changing it.

    Returns:
        tuple: ``(flesch_kincaid_score, lexical_diversity)``, or
        ``(None, None)`` when the text has fewer than ``min_words`` tokens.
        Lexical diversity is the number of distinct tokens divided by the
        total number of tokens.
    """
    # Tokenize once and reuse the result for the length check and the ratio.
    words = word_tokenize(text)
    if not words or len(words) < min_words:
        return None, None  # Default values for short texts
    fk = Readability(text).flesch_kincaid()
    lexical_diversity = len(set(words)) / len(words)
    return fk.score, lexical_diversity

### After Cleaning Operation

In [16]:
import time
from collections import Counter

def _accumulate_stats(stats, tokens, sentences):
    """Fold one document's tokens and sentences into a running stats dict, in place."""
    stats['word_count'] += len(tokens)
    stats['sentence_count'] += len(sentences)
    # default=0 keeps an empty document from raising ValueError in max()/min().
    longest_word = max((len(word) for word in tokens), default=0)
    stats['max_word_length'] = max(stats['max_word_length'], longest_word)
    # Sentence length here is the number of whitespace-separated pieces.
    sentence_lengths = [len(sentence.split()) for sentence in sentences]
    stats['min_sentence_length'] = min(stats['min_sentence_length'], min(sentence_lengths, default=0))
    stats['max_sentence_length'] = max(stats['max_sentence_length'], max(sentence_lengths, default=0))


def preprocess_data(data):
    """Run the cleaning pipeline over ``data['text']`` and collect summary metrics.

    For every document the raw text is tokenized (before-cleaning statistics),
    cleaned with ``clean_text``, and - only when the cleaned text has at least
    100 word tokens - scanned for sensitive information, stripped of stop
    words and scored with ``calculate_statistics`` / ``evaluate_metrics``.

    NOTE(review): documents with fewer than 100 cleaned tokens are skipped
    before sensitive-info detection and before the after-cleaning statistics
    are updated, so those figures only describe the long documents while
    ``doc_count`` still reports every row. Confirm this is intended.

    Args:
        data: DataFrame with a ``text`` column. Missing values are treated as
            empty documents and other non-string values are converted with
            ``str``.

    Returns:
        tuple: ``(results, before_stats, after_stats, cleaning_metrics, runtime)``

        * ``results``: one dict per row with ``cleaned_text``, ``stats``,
          ``readability`` and ``lexical_diversity`` (the last three are
          ``None`` for skipped short documents).
        * ``before_stats`` / ``after_stats``: corpus-level counts before and
          after cleaning; each has its own vocabulary size.
        * ``cleaning_metrics``: totals of removed characters, removed stop
          words and detected addresses / phones / accounts.
        * ``runtime``: elapsed wall-clock time in minutes.
    """
    # Initialize metrics
    special_chars_removed = 0
    stop_words_removed = 0
    addresses_detected = 0
    phones_detected = 0
    accounts_detected = 0

    # Initialize stats for before and after cleaning
    before_stats = {
        'doc_count': len(data),
        'word_count': 0,
        'sentence_count': 0,
        'vocabulary_size': 0,
        'max_word_length': 0,
        'min_sentence_length': float('inf'),
        'max_sentence_length': 0,
        'avg_sentence_length': 0
    }
    after_stats = before_stats.copy()

    # Separate vocabularies: a single shared one would make the before and
    # after vocabulary sizes identical.
    vocabulary_before = set()
    vocabulary_after = set()

    start_time = time.time()

    results = []
    for raw_text in data['text']:
        # Tokenizers and regexes need a str; NaN/None cells become ''.
        if isinstance(raw_text, str):
            text = raw_text
        elif pd.isna(raw_text):
            text = ''
        else:
            text = str(raw_text)

        # Step 1: Collect before-cleaning stats
        sentences, tokens = tokenize_text(text)
        _accumulate_stats(before_stats, tokens, sentences)
        vocabulary_before.update(tokens)

        # Step 2: Clean Text
        cleaned_text, chars_removed = clean_text(text)
        special_chars_removed += chars_removed

        # Skip processing if text has fewer than 100 words
        if len(word_tokenize(cleaned_text)) < 100:
            results.append({
                'cleaned_text': cleaned_text,
                'stats': None,
                'readability': None,
                'lexical_diversity': None
            })
            continue

        # Step 3: Detect sensitive info (on the raw text, digits still present)
        addr_count, phone_count, acc_count = detect_sensitive_info(text)
        addresses_detected += addr_count
        phones_detected += phone_count
        accounts_detected += acc_count

        # Step 4: Process tokens after cleaning
        sentences, tokens = tokenize_text(cleaned_text)
        tokens_processed, stops_removed = process_tokens(tokens)
        stop_words_removed += stops_removed

        # Collect after-cleaning stats
        _accumulate_stats(after_stats, tokens_processed, sentences)
        vocabulary_after.update(tokens_processed)

        stats = calculate_statistics(cleaned_text, tokens_processed, sentences)
        readability, lexical_diversity = evaluate_metrics(cleaned_text)

        results.append({
            'cleaned_text': cleaned_text,
            'stats': stats,
            'readability': readability,
            'lexical_diversity': lexical_diversity
        })

    # Calculate vocabulary size
    before_stats['vocabulary_size'] = len(vocabulary_before)
    after_stats['vocabulary_size'] = len(vocabulary_after)

    for stats_dict in (before_stats, after_stats):
        # Calculate average sentence length (guarding against zero sentences)
        stats_dict['avg_sentence_length'] = stats_dict['word_count'] / max(stats_dict['sentence_count'], 1)
        # No document contributed: report 0 rather than the inf sentinel.
        if stats_dict['min_sentence_length'] == float('inf'):
            stats_dict['min_sentence_length'] = 0

    # Total runtime
    runtime = (time.time() - start_time) / 60  # Convert seconds to minutes

    # Return results and metrics
    cleaning_metrics = {
        'special_chars_removed': special_chars_removed,
        'stop_words_removed': stop_words_removed,
        'addresses_detected': addresses_detected,
        'phones_detected': phones_detected,
        'accounts_detected': accounts_detected
    }
    return results, before_stats, after_stats, cleaning_metrics, runtime


### Output Display Function

In [None]:
def display_cleaning_summary(before_stats, after_stats, cleaning_metrics, runtime):
    """Print the cleaning report: before → after statistics, totals and runtime.

    Args:
        before_stats: Stats dict collected before cleaning.
        after_stats: Stats dict collected after cleaning (same keys).
        cleaning_metrics: Dict with the removal / detection totals.
        runtime: Elapsed time in minutes; printed with one decimal place.

    Returns:
        None. The report is written to standard output.
    """
    # (label, stats key, format spec) for every "before → after" line.
    comparisons = (
        ('Number of documents', 'doc_count', ''),
        ('Average sentence length', 'avg_sentence_length', '.1f'),
        ('Word count', 'word_count', ''),
        ('Sentence count', 'sentence_count', ''),
        ('Vocabulary size', 'vocabulary_size', ''),
        ('Max word length', 'max_word_length', ''),
        ('Min sentence length', 'min_sentence_length', ''),
        ('Max sentence length', 'max_sentence_length', ''),
    )
    # (label, metrics key) for every single-value line.
    totals = (
        ('Special characters removed', 'special_chars_removed'),
        ('Stop words removed', 'stop_words_removed'),
        ('Addresses detected', 'addresses_detected'),
        ('Phone numbers detected', 'phones_detected'),
        ('Account numbers detected', 'accounts_detected'),
    )

    # Leading blank line, title, blank line - then one entry per report line.
    report = ['', '### Text Cleaning Statistics ###', '']
    for label, key, spec in comparisons:
        report.append(f"- {label}: {before_stats[key]:{spec}} → {after_stats[key]:{spec}}")
    for label, key in totals:
        report.append(f"- {label}: {cleaning_metrics[key]}")
    report.append(f"- Total runtime: {runtime:.1f} minutes")
    # The report ends with a line holding four spaces.
    report.append('    ')
    print('\n'.join(report))


### Final Results on the Training Data

In [None]:
# Run the full pipeline on the training split (preprocess_data reads its 'text' column)
# and print the before/after summary.
results, before_stats, after_stats, cleaning_metrics, runtime = preprocess_data(train_df)
display_cleaning_summary(before_stats, after_stats, cleaning_metrics, runtime)


### Final Result of test data

In [None]:
# Run the same pipeline on the test split and print its summary.
# NOTE: this rebinds results, before_stats, after_stats, cleaning_metrics and runtime,
# so any values held in those names from an earlier run are replaced.
results, before_stats, after_stats, cleaning_metrics, runtime = preprocess_data(test_df)
display_cleaning_summary(before_stats, after_stats, cleaning_metrics, runtime)
