In [1]:
import pandas as pd
import re
from bs4 import BeautifulSoup

# Load dataset
df = pd.read_csv("UNITENReview.csv")

# Raw column: used for the checks that must see every row (missing, duplicates).
reviews = df["Review"]

# Text-only view: rows with a missing review are dropped and the rest coerced to
# str, so the content checks below cannot crash on NaN (BeautifulSoup and the
# .str accessor both require real strings).
text_reviews = reviews.dropna().astype(str)

# Check for missing values
missing_values = reviews.isnull().sum()

# Check for empty or whitespace-only reviews
empty_reviews = (text_reviews.str.strip() == "").sum()

# Check for special characters and punctuation
special_chars = text_reviews.str.contains(r"[^\w\s]", regex=True).sum()

# Check for duplicate reviews
duplicate_reviews = reviews.duplicated().sum()

# Check for numeric values
numeric_values = text_reviews.str.contains(r"\d", regex=True).sum()

# Check for URLs
# NOTE(review): inside the class `[$-_@.&+]`, `$-_` is a character *range*
# (U+0024..U+005F), so it also admits digits, upper-case letters and more.
url_pattern = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
url_count = text_reviews.str.contains(url_pattern, regex=True).sum()

# Check for HTML tags: a review counts if the parser finds at least one element
html_tags = text_reviews.apply(lambda x: bool(BeautifulSoup(x, "html.parser").find())).sum()

# Check for emojis
# The previous final alternative, [\U000024C2-\U0001F251], spanned everything
# from U+24C2 to U+1F251 -- including CJK ideographs, Hangul and box-drawing
# characters -- so non-emoji text was counted as emoji. It is replaced by
# sub-ranges of that same span (nothing new is matched):
#   U+24C2, U+2600-26FF, U+2B00-2BFF, U+1F1E6-1F251
emoji_pattern = (
    r"[\U0001F600-\U0001F64F]|[\U0001F300-\U0001F5FF]|[\U0001F680-\U0001F6FF]"
    r"|[\U0001F700-\U0001F77F]|[\U0001F780-\U0001F7FF]|[\U0001F800-\U0001F8FF]"
    r"|[\U0001F900-\U0001F9FF]|[\U0001FA00-\U0001FA6F]|[\U0001FA70-\U0001FAFF]"
    r"|[\U00002702-\U000027B0]"
    r"|\U000024C2|[\U00002600-\U000026FF]|[\U00002B00-\U00002BFF]|[\U0001F1E6-\U0001F251]"
)
emoji_count = text_reviews.str.contains(emoji_pattern, regex=True).sum()

# Display results
print(f"Missing Values: {missing_values}")
print(f"Empty or Whitespace-Only Reviews: {empty_reviews}")
print(f"Special Characters & Punctuation: {special_chars}")
print(f"Duplicate Reviews: {duplicate_reviews}")
print(f"Numeric Values: {numeric_values}")
print(f"URLs: {url_count}")
print(f"HTML Tags: {html_tags}")
print(f"Emojis: {emoji_count}")


Missing Values: 0
Empty or Whitespace-Only Reviews: 0
Special Characters & Punctuation: 42
Duplicate Reviews: 0
Numeric Values: 4
URLs: 0
HTML Tags: 0
Emojis: 1


  html_tags = df["Review"].apply(lambda x: bool(BeautifulSoup(x, "html.parser").find())).sum()


In [33]:
import pandas as pd
import re
import emoji
import string
import nltk

from bs4 import BeautifulSoup
from autocorrect import Speller
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag

# Download required NLTK resources (same packages, same order as before)
for resource in (
    'stopwords',
    'wordnet',                          # For lemmatization
    'omw-1.4',                          # WordNet lexical database
    'averaged_perceptron_tagger_eng',   # For POS tagging
    'punkt_tab',                        # For tokenization
):
    nltk.download(resource)


# Initialize tools shared by the preprocessing helpers below
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
spell = Speller(lang='en')

# Lookup table: lower-case slang / chat abbreviation -> replacement text.
# Keys are matched case-insensitively as whole words by replace_slang().
slang_dict = dict(
    tbh="to be honest",
    omg="oh my god",
    lol="laugh out loud",
    idk="I don't know",
    brb="be right back",
    btw="by the way",
    imo="in my opinion",
    smh="shaking my head",
    fyi="for your information",
    np="no problem",
    ikr="I know right",
    asap="as soon as possible",
    bff="best friend forever",
    gg="good game",
    hmu="hit me up",
    rofl="rolling on the floor laughing",
    w="winner",
    uniten="uniten",
    till="until",
)

# Contractions dictionary: contraction -> expanded form.
# Every key MUST be lower-case: replace_contractions() lower-cases the matched
# text before looking it up here, so a capitalised key (previously "I'm") can
# never be found and raises KeyError as soon as the text contains "i'm".
contractions_dict = {
    "wasn't": "was not",
    "isn't": "is not",
    "aren't": "are not",
    "weren't": "were not",
    "doesn't": "does not",
    "don't": "do not",
    "didn't": "did not",
    "can't": "cannot",
    "couldn't": "could not",
    "shouldn't": "should not",
    "wouldn't": "would not",
    "won't": "will not",
    "haven't": "have not",
    "hasn't": "has not",
    "hadn't": "had not",
    "i'm": "i am",
    "you're": "you are",
    "he's": "he is",
    "she's": "she is",
    "it's": "it is",
    "we're": "we are",
    "they're": "they are",
    "i've": "i have",
    "you've": "you have",
    "we've": "we have",
    "they've": "they have",
    "i'd": "i would",
    "you'd": "you would",
    "he'd": "he would",
    "she'd": "she would",
    "we'd": "we would",
    "they'd": "they would",
    "i'll": "i will",
    "you'll": "you will",
    "he'll": "he will",
    "she'll": "she will",
    "we'll": "we will",
    "they'll": "they will",
    "let's": "let us",
    "that's": "that is",
    "who's": "who is",
    "what's": "what is",
    "where's": "where is",
    "when's": "when is",
    "why's": "why is"
}

# Remove URLs ("http://...", "https://..." or "www. ...") from the text
def remove_urls(text):
    """Strip web addresses from ``text``.

    A URL is a run of non-whitespace characters that starts with ``http://``,
    ``https://`` or a word-initial ``www.``. The scheme separator / dot is
    required so that ordinary words which merely contain those letters
    (e.g. "awwww", "httpd") are left untouched; the earlier pattern
    ``http\\S+|www\\S+`` deleted them.

    Args:
        text: Input string.

    Returns:
        ``text`` with every matched URL replaced by the empty string
        (surrounding whitespace is kept).
    """
    return re.sub(r'https?://\S+|\bwww\.\S+', '', text, flags=re.IGNORECASE)
    
# Keep only the text content, discarding all HTML markup
def remove_html(text):
    """Parse ``text`` with the built-in "html.parser" backend and return its text."""
    parsed = BeautifulSoup(text, "html.parser")
    return parsed.get_text()

# Delete every emoji from the text
def remove_emojis(text):
    """Return ``text`` with each emoji replaced by the empty string."""
    without_emojis = emoji.replace_emoji(text, replace='')
    return without_emojis

# Replace internet slang/chat words
def replace_slang(text):
    """Swap every whole-word slang term found in ``slang_dict`` for its full form.

    Matching is case-insensitive; the replacement text is taken verbatim from
    ``slang_dict`` (looked up by the lower-cased match).
    """
    # Alternation of all slang keys, each escaped so it is matched literally
    alternatives = '|'.join(re.escape(term) for term in slang_dict.keys())
    slang_pattern = r'\b(' + alternatives + r')\b'

    return re.sub(
        slang_pattern,
        lambda hit: slang_dict[hit.group(0).lower()],
        text,
        flags=re.IGNORECASE,
    )

# Function to expand contractions
# Regex setup, done once at import time rather than on every call.

# Each contraction escaped so it is matched literally
escaped_contractions = [re.escape(key) for key in contractions_dict.keys()]

# Alternation of all contractions
joined_contractions = "|".join(escaped_contractions)

# Whole-word match only (\b on both sides)
contractions_pattern = r'\b(' + joined_contractions + r')\b'

# Case-insensitive compiled form used by replace_contractions()
compiled_pattern = re.compile(contractions_pattern, flags=re.IGNORECASE)

# Define a function to replace contractions
def replace_contractions(text):
    """Expand English contractions in ``text`` using ``contractions_dict``.

    Two problems with the straightforward lookup are handled here:

    * Typographic apostrophes (U+2019 / U+2018, e.g. "I’m") are converted to
      the ASCII apostrophe first; otherwise ``compiled_pattern``, which is
      built from ASCII-apostrophe keys, never matches them.
    * The match is looked up in a lower-cased copy of the dictionary keys, so
      a key stored with different casing cannot cause a KeyError (the pattern
      itself is compiled with ``re.IGNORECASE``).

    Args:
        text: Input string.

    Returns:
        The text with every matched contraction replaced by its expanded form.
        Note that all typographic apostrophes in the text become ASCII ``'``.
    """
    # Normalise curly apostrophes so they are recognised as contractions
    normalized = text.replace("\u2019", "'").replace("\u2018", "'")

    # Case-insensitive lookup table (the dictionary is small, so this is cheap)
    expansions = {key.lower(): value for key, value in contractions_dict.items()}

    def expand(match):
        # match.group(0) is the contraction exactly as written in the text
        return expansions[match.group(0).lower()]

    return compiled_pattern.sub(expand, normalized)

# Drop ASCII punctuation characters
def remove_punctuation(text):
    """Return ``text`` without any character listed in ``string.punctuation``.

    Characters are deleted, not replaced by spaces; non-ASCII punctuation is
    left in place.
    """
    return "".join(char for char in text if char not in string.punctuation)

# Drop every run of digits
def remove_numbers(text):
    """Return ``text`` with all digit sequences deleted (nothing is inserted in their place)."""
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', text)

# Spell-correct the text with the module-level AutoCorrect speller
def correct_spelling(text):
    """Pass ``text`` through ``spell`` and return the result."""
    corrected = spell(text)
    return corrected

# Filter out stop words
def remove_stopwords(text):
    """Return the whitespace-separated words of ``text`` that are not stop words.

    Membership in ``stop_words`` is tested on the lower-cased word; surviving
    words keep their original casing and are re-joined with single spaces.
    """
    kept = filter(lambda token: token.lower() not in stop_words, text.split())
    return " ".join(kept)

# Translate an NLTK POS tag into the matching WordNet POS constant
def get_wordnet_pos(nltk_tag):
    """Map ``nltk_tag`` to a WordNet POS by its first letter.

    J -> adjective, V -> verb, N -> noun, R -> adverb; anything else falls
    back to noun.
    """
    prefix_table = (
        ('J', wordnet.ADJ),   # Adjective
        ('V', wordnet.VERB),  # Verb
        ('N', wordnet.NOUN),  # Noun
        ('R', wordnet.ADV),   # Adverb
    )
    for prefix, wordnet_pos in prefix_table:
        if nltk_tag.startswith(prefix):
            return wordnet_pos
    return wordnet.NOUN  # Default to noun

# POS-aware lemmatization
def lemmatize_text(text):
    """Lemmatize every token of ``text`` using its POS tag.

    Non-string input yields the empty string. Otherwise the text is tokenized,
    POS-tagged, each token lemmatized with the WordNet POS derived from its
    tag, and the lemmas are joined with single spaces.
    """
    if not isinstance(text, str):
        return ""

    lemmas = []
    for token, tag in pos_tag(word_tokenize(text)):
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(tag)))

    return " ".join(lemmas)

# Split text into word tokens
def tokenize_text(text):
    """Return the word tokens of ``text``; non-string input gives an empty list."""
    return word_tokenize(text) if isinstance(text, str) else []

# Function to apply all preprocessing steps
def preprocess_text(text):
    """Run the full cleaning pipeline on one review and return its tokens.

    Steps, in order: lower-casing, URL removal, HTML removal, emoji removal,
    slang replacement, contraction expansion, punctuation removal, number
    removal, spelling correction, stop-word removal, lemmatization,
    tokenization.

    Args:
        text: The raw review. Expected to be a string.

    Returns:
        A list of tokens. Non-string input (e.g. the float NaN pandas uses for
        a missing cell, or None) yields an empty list instead of raising on
        ``text.lower()``; this mirrors the guards in ``lemmatize_text`` and
        ``tokenize_text``.
    """
    if not isinstance(text, str):
        return []

    text = text.lower()                    # Step 1: Lowercasing

    # Steps 2-12, each taking the previous step's output
    pipeline = (
        remove_urls,            # Step 2: Remove URLs
        remove_html,            # Step 3: Remove HTML tags
        remove_emojis,          # Step 4: Remove Emojis
        replace_slang,          # Step 5: Replace Slang
        replace_contractions,   # Step 6: Expand Contractions
        remove_punctuation,     # Step 7: Remove Punctuation
        remove_numbers,         # Step 8: Remove Numbers
        correct_spelling,       # Step 9: Correct Spelling
        remove_stopwords,       # Step 10: Remove Stopwords
        lemmatize_text,         # Step 11: Lemmatization
        tokenize_text,          # Step 12: Tokenization (str -> list of tokens)
    )
    for step in pipeline:
        text = step(text)
    return text

# Read the raw reviews and attach the preprocessed tokens as a "Cleaned" column
df = pd.read_csv("UNITENReview.csv").assign(
    Cleaned=lambda frame: frame["Review"].apply(preprocess_text)
)

# Show the first few raw/cleaned pairs
print(df.loc[:, ["Review", "Cleaned"]].head())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mdade\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mdade\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\mdade\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\mdade\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\mdade\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
  return BeautifulSoup(text, "html.parser").get_text()


                                                                                                                                                                                                                                                                                                                                                         Review  \
0                                                                                                                                                                                                                                                                                                          Im happy with uniten actually, even the people are W   
1                                                                                                                                                                                                                                                                                        I’m havin

In [35]:
# Write the reviews plus their "Cleaned" column to disk, without the index column
df.to_csv(path_or_buf="UNITENReview_Cleaned.csv", index=False)