# Programming Assignment 2
## NLP Data Cleaning & Evaluation

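This notebook cleans and normalizes the `SentimentText` column of `Sentiment Analysis Dataset.csv` — removing emoticons, phone and account numbers, address keywords, special characters, and stop words, then lemmatizing the remaining tokens — and evaluates the effect by comparing corpus statistics, readability, and lexical diversity before and after cleaning.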

In [1]:
import pandas as pd
import re
import time
import nltk
import textstat

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
# Fetch the NLTK resources used by the cells below: the Punkt tokenizer
# models, the English stop-word list, and WordNet for lemmatization.
nltk.download('punkt')
# Recent NLTK releases (3.9+) load the Punkt tokenizer from 'punkt_tab'
# instead of 'punkt'; requesting both keeps sent_tokenize/word_tokenize
# working on old and new installations alike.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Input CSV, given relative to the notebook's working directory.
CSV_PATH = "Sentiment Analysis Dataset.csv"
# Name of the column that holds the raw text to clean.
TEXT_COL = "SentimentText"

# English stop words held in a set, so the per-token membership test in
# clean_text is a hash lookup rather than a list scan.
STOP_WORDS = set(stopwords.words("english"))
# One shared lemmatizer instance, reused for every token.
LEMMATIZER = WordNetLemmatizer()

In [4]:
# The fixed set of emoticons that gets stripped: :)  :(  :D  ;)  <3
EMOTICON_RE = re.compile(r"(:\)|:\(|:D|;\)|<3)")
# Any character that is not an ASCII letter or whitespace (so digits and
# punctuation are included).
SPECIAL_CHAR_RE = re.compile(r"[^a-zA-Z\s]")
# Digit groups of 3-3-4, each pair optionally separated by '-', '.' or a
# single whitespace character.
PHONE_RE = re.compile(r"\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b")
# Stand-alone runs of 10 to 16 digits. Note that an unseparated 10-digit
# number matches both this pattern and PHONE_RE.
ACCOUNT_RE = re.compile(r"\b\d{10,16}\b")
# Case-insensitive street-type keywords only; the rest of an address
# (house number, street name) is not matched by this pattern.
ADDRESS_RE = re.compile(r"\b(street|road|rd|ave|avenue|lane|ln)\b", re.I)

In [5]:
def basic_stats(texts):
    """Compute sentence- and word-level statistics for a corpus.

    Each item of ``texts`` is converted with ``str()``, split into sentences
    with ``sent_tokenize`` and each sentence into tokens with
    ``word_tokenize``.

    Parameters
    ----------
    texts : iterable
        Documents to analyse. Items are passed through ``str()`` first.

    Returns
    -------
    dict
        ``sentence_count``, ``word_count``, ``vocab_size`` (number of distinct
        tokens), ``avg_sentence_length``, ``max_sentence_length``,
        ``min_sentence_length`` (all three measured in tokens per sentence)
        and ``max_word_length`` (characters in the longest token).
        For a corpus that yields no sentences or tokens the averages and
        extrema are reported as 0 instead of raising.
    """
    sentence_count = 0
    words = []
    sent_lengths = []
    for t in texts:
        for s in sent_tokenize(str(t)):
            sentence_count += 1
            # Tokenize each sentence once and reuse the result for both the
            # word list and the sentence-length list.
            tokens = word_tokenize(s)
            words.extend(tokens)
            if s.strip():
                sent_lengths.append(len(tokens))

    # Guard the aggregate metrics: an empty corpus (or one made only of
    # empty strings) would otherwise divide by zero / call max() on nothing.
    avg_sentence_length = (
        sum(sent_lengths) / len(sent_lengths) if sent_lengths else 0.0
    )
    return {
        "sentence_count": sentence_count,
        "word_count": len(words),
        "vocab_size": len(set(words)),
        "avg_sentence_length": avg_sentence_length,
        "max_sentence_length": max(sent_lengths, default=0),
        "min_sentence_length": min(sent_lengths, default=0),
        "max_word_length": max((len(w) for w in words), default=0)
    }

In [6]:
def clean_text(text, counters):
    """Clean one document and update the running removal counters.

    Steps, in order: strip emoticons, phone numbers, account numbers and
    address keywords (each match replaced by a space); lowercase; replace
    every character that is not an ASCII letter or whitespace with a space;
    tokenize; drop stop words; lemmatize the remaining tokens.

    Parameters
    ----------
    text : object
        The raw document; converted with ``str()`` before processing.
    counters : dict
        Mutable mapping updated in place. Must already contain the keys
        ``emoticon_removed``, ``phone_removed``, ``account_removed``,
        ``address_removed``, ``special_char_removed``, ``stopword_removed``
        and ``token_count``.

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces.
    """
    text = str(text)

    # subn() returns the number of substitutions it actually performed, so
    # each counter reflects real removals. Counting with findall() on the
    # untouched text would count an unseparated 10-digit number as both a
    # phone number and an account number even though only the phone
    # substitution removes it.
    removal_steps = (
        (EMOTICON_RE, 'emoticon_removed'),
        (PHONE_RE, 'phone_removed'),
        (ACCOUNT_RE, 'account_removed'),
        (ADDRESS_RE, 'address_removed'),
    )
    for pattern, counter_key in removal_steps:
        text, n_removed = pattern.subn(' ', text)
        counters[counter_key] += n_removed

    # Lowercasing happens after emoticon removal because ':D' is
    # case-sensitive in EMOTICON_RE.
    text = text.lower()
    text, n_special = SPECIAL_CHAR_RE.subn(' ', text)
    counters['special_char_removed'] += n_special

    cleaned_tokens = []
    for token in word_tokenize(text):
        if token in STOP_WORDS:
            counters['stopword_removed'] += 1
        else:
            cleaned_tokens.append(LEMMATIZER.lemmatize(token))

    counters['token_count'] += len(cleaned_tokens)
    return ' '.join(cleaned_tokens)

In [7]:
def readability_per_sentence(texts):
    """Return the mean Flesch-Kincaid grade over the sentences of a corpus.

    Each item of ``texts`` is converted with ``str()`` and split with
    ``sent_tokenize``; only sentences with more than two whitespace-separated
    words are scored with ``textstat.flesch_kincaid_grade``.

    Parameters
    ----------
    texts : iterable
        Documents to score.

    Returns
    -------
    float
        The arithmetic mean of the per-sentence grades, or ``0.0`` when no
        sentence qualifies (e.g. an empty corpus) instead of dividing by zero.
    """
    scores = [
        textstat.flesch_kincaid_grade(s)
        for t in texts
        for s in sent_tokenize(str(t))
        if len(s.split()) > 2
    ]
    if not scores:
        return 0.0
    return sum(scores) / len(scores)

In [8]:
def lexical_diversity(texts):
    """Return the type-token ratio (distinct tokens / total tokens) of a corpus.

    Each item of ``texts`` is converted with ``str()`` and tokenized with
    ``word_tokenize``; tokens are compared exactly as produced (no case
    folding is applied here).

    Parameters
    ----------
    texts : iterable
        Documents to analyse.

    Returns
    -------
    float
        ``len(set(tokens)) / len(tokens)``, or ``0.0`` when the corpus
        produces no tokens instead of dividing by zero.
    """
    words = []
    for t in texts:
        words.extend(word_tokenize(str(t)))
    if not words:
        return 0.0
    return len(set(words)) / len(words)

In [9]:
# Read the raw dataset; the file is decoded as latin1.
df = pd.read_csv(CSV_PATH, encoding='latin1')
# Baseline corpus: the text column with missing values removed, as a list.
original_texts = (
    df[TEXT_COL]
    .dropna()
    .tolist()
)

In [10]:
# Baseline metrics on the raw text: corpus statistics, mean readability
# grade, and type-token ratio.
before_stats, before_readability, before_lexical = (
    basic_stats(original_texts),
    readability_per_sentence(original_texts),
    lexical_diversity(original_texts),
)
(before_stats, before_readability, before_lexical)

({'sentence_count': 1814099,
  'word_count': 17582802,
  'vocab_size': 696649,
  'avg_sentence_length': 9.692305656968005,
  'max_sentence_length': 229,
  'min_sentence_length': 1,
  'max_word_length': 136},
 4.082191271973262,
 0.039630998459033694)

In [11]:
# perf_counter() is the clock intended for measuring elapsed intervals;
# unlike time.time() it is unaffected by system clock adjustments.
start_time = time.perf_counter()

# Running totals updated in place by clean_text. Re-created on every run of
# this cell so that re-executing it does not accumulate stale counts.
counters = {
    'emoticon_removed': 0,
    'stopword_removed': 0,
    'token_count': 0,
    'special_char_removed': 0,
    'phone_removed': 0,
    'account_removed': 0,
    'address_removed': 0
}

# Clean only rows that actually contain text. clean_text() calls str() on its
# input, so a missing value would otherwise become the literal token "nan",
# inflate the counters, and make the dropna() below a no-op. Assigning the
# result back aligns on the index, leaving NaN in 'clean_text' for the rows
# that had no text, so the cleaned corpus covers the same rows as the
# dropna()'d original text.
df['clean_text'] = df[TEXT_COL].dropna().apply(lambda x: clean_text(x, counters))
clean_texts = df['clean_text'].dropna().tolist()
runtime = time.perf_counter() - start_time

In [12]:
# Same three metrics as the baseline, now computed on the cleaned corpus.
after_stats, after_readability, after_lexical = (
    basic_stats(clean_texts),
    readability_per_sentence(clean_texts),
    lexical_diversity(clean_texts),
)
(after_stats, after_readability, after_lexical)

({'sentence_count': 1048384,
  'word_count': 8192895,
  'vocab_size': 476685,
  'avg_sentence_length': 7.81478446828643,
  'max_sentence_length': 78,
  'min_sentence_length': 1,
  'max_word_length': 117},
 6.425984062831788,
 0.058182730280322156)

In [13]:
# Persist the cleaned corpus, one document per line, as UTF-8 text.
with open('cleaned_corpus.txt', 'w', encoding='utf-8') as out_file:
    out_file.writelines(line + '\n' for line in clean_texts)

In [14]:
# Final before/after summary. The formatted scalar metrics are collected
# first and emitted one per line; the dict-valued entries are printed directly.
metric_lines = [
    f'Readability Before: {before_readability:.2f}',
    f'Readability After : {after_readability:.2f}',
    f'Lexical Diversity Before: {before_lexical:.3f}',
    f'Lexical Diversity After : {after_lexical:.3f}',
    f'Runtime: {runtime:.2f} seconds',
]
print('Before Cleaning:', before_stats)
print('After Cleaning:', after_stats)
print(*metric_lines, sep='\n')
print('Counters:', counters)

Before Cleaning: {'sentence_count': 1814099, 'word_count': 17582802, 'vocab_size': 696649, 'avg_sentence_length': 9.692305656968005, 'max_sentence_length': 229, 'min_sentence_length': 1, 'max_word_length': 136}
After Cleaning: {'sentence_count': 1048384, 'word_count': 8192895, 'vocab_size': 476685, 'avg_sentence_length': 7.81478446828643, 'max_sentence_length': 78, 'min_sentence_length': 1, 'max_word_length': 117}
Readability Before: 4.08
Readability After : 6.43
Lexical Diversity Before: 0.040
Lexical Diversity After : 0.058
Runtime: 157.66 seconds
Counters: {'emoticon_removed': 5860, 'stopword_removed': 6424510, 'token_count': 8192895, 'special_char_removed': 5051045, 'phone_removed': 356, 'account_removed': 361, 'address_removed': 2596}
