<a href="https://colab.research.google.com/github/Shantanuraje18/Codeit-Internship-AIML/blob/main/Data_Cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Data handling**

In [None]:
import re

import kagglehub
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

# 1. Resource Download
# NLTK data for the tokenizer, lemmatizer and stopword list imported above.
for resource in ('punkt_tab', 'wordnet', 'stopwords'):
    nltk.download(resource)

# 2. Import CSV from Kagglehub
# `path` is used below as the local directory that contains spam.csv.
path = kagglehub.dataset_download("uciml/sms-spam-collection-dataset")
# Load the CSV (using encoding='latin-1' as common for this dataset)
df = pd.read_csv(f"{path}/spam.csv", encoding='latin-1')

# 3. Clean the DataFrame (Keep only the text column)
# Only the first N_LINES messages are kept so the walkthrough stays small.
N_LINES = 150
df = (
    df[['v2']]
    .rename(columns={'v2': 'raw_text'})
    .head(N_LINES)
    # head() hands back a slice of the parent frame; later cells add columns
    # to `df`, so take an independent copy to avoid SettingWithCopyWarning
    # (and any ambiguity about writing into a view).
    .copy()
)

print(f"Successfully imported {len(df)} lines of text from CSV.")
display(df.head())

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Using Colab cache for faster access to the 'sms-spam-collection-dataset' dataset.
Successfully imported 150 lines of text from CSV.


Unnamed: 0,raw_text
0,"Go until jurong point, crazy.. Available only ..."
1,Ok lar... Joking wif u oni...
2,Free entry in 2 a wkly comp to win FA Cup fina...
3,U dun say so early hor... U c already then say...
4,"Nah I don't think he goes to usf, he lives aro..."


**Lowercasing, Handling Contractions, Tokenization**

In [None]:
# Step 1: Lowercasing
# Stores a lowercase version of every 'raw_text' entry in a new 'lowered'
# column. The contraction keys defined later in this cell are all lowercase,
# so their matching relies on this step having run first.
df['lowered'] = df['raw_text'].str.lower()

# Step 2: Handling Contractions (Expanding common English short-forms)
contractions = {"i'm": "i am", "it's": "it is", "can't": "cannot", "don't": "do not"}

def expand_contractions(text):
    """Expand the short-forms listed in ``contractions`` inside ``text``.

    Matching is case-sensitive (the keys are lowercase, so pass lowercased
    text) and whole-word only: a key is replaced only when it is not glued
    to another word character, so the "it's" inside "unit's" is left alone.

    Args:
        text: The string to process.

    Returns:
        A new string with every whole-word contraction replaced by its
        expansion; text without contractions is returned unchanged.
    """
    for key, value in contractions.items():
        # Lookarounds rather than plain str.replace: substring replacement
        # would also rewrite the tail of longer words ("unit's" -> "unit is").
        pattern = r"(?<!\w)" + re.escape(key) + r"(?!\w)"
        # A function replacement inserts the expansion literally, so a
        # backslash in a future dictionary value is never read as an escape.
        text = re.sub(pattern, lambda _match: value, text)
    return text

# Run the contraction expansion over every lowercased message and keep the
# result both as a column and as a local handle for the next step.
expanded_text = df['lowered'].apply(expand_contractions)
df['expanded'] = expanded_text

# Step 3: Tokenization
# word_tokenize is applied per message, so each cell of 'tokens' holds the
# token sequence produced for that message.
df['tokens'] = expanded_text.apply(word_tokenize)

print("Step 1-3 Complete: Lowering, Contractions, and Tokenization.")

Step 1-3 Complete: Lowering, Contractions, and Tokenization.


**Noise Removal, Stopword Removal**

In [None]:
# Step 4: Noise Removal (Removing punctuation and non-alphabetic tokens)
def remove_noise(tokens):
    """Keep only the purely alphabetic tokens.

    A token survives when ``str.isalpha`` is true for it, which drops
    punctuation, numbers, mixed tokens such as "abc123" and empty strings.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A new list with the surviving tokens in their original order.
    """
    alphabetic = []
    for token in tokens:
        if token.isalpha():
            alphabetic.append(token)
    return alphabetic

# Run the noise filter over every token list.
df['noise_removed'] = df['tokens'].apply(remove_noise)

# Step 5: Stopword Removal
# Deleting common words like 'the', 'is', 'a'
# The English stopword list is held as a set so each membership check in
# remove_stops is a hash lookup rather than a list scan.
stop_words = set(stopwords.words('english'))

def remove_stops(tokens):
    """Return the tokens that are not in ``stop_words``, order preserved.

    The comparison is an exact match against the set, so callers should pass
    tokens in the same case as the stopword entries.
    """
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

df['filtered'] = df['noise_removed'].apply(remove_stops)

print("Step 4-5 Complete: Noise and Stopword Removal.")

Step 4-5 Complete: Noise and Stopword Removal.


**Stemming, Lemmatization**

In [None]:
# Step 6: Stemming & Lemmatization
# Both reducers are created once and shared by every call below.
ps = PorterStemmer()              # rule-based suffix stripping
lemmatizer = WordNetLemmatizer()  # WordNet dictionary lookup

def apply_stem_lem(tokens):
    """Pair each token with its stemmed and its lemmatized form.

    Args:
        tokens: Iterable of word strings.

    Returns:
        A list of ``(token, stem, lemma)`` tuples in input order.
    """
    triples = []
    for token in tokens:
        stem = ps.stem(token)
        # NOTE(review): lemmatize() is called without a POS argument; NLTK
        # documents the default as noun, so verb forms may come back
        # unchanged - confirm whether that is acceptable here.
        lemma = lemmatizer.lemmatize(token)
        triples.append((token, stem, lemma))
    return triples

df['final_transform'] = df['filtered'].apply(apply_stem_lem)

# BETTER REPRESENTATION: Detailed Comparison Table
# Flatten the (word, stem, lemma) triples of the first five messages into one
# record per word, so both reductions can be compared side by side.
comparison_list = [
    {
        "Sentence_ID": index + 1,  # row index shifted to a 1-based label
        "Cleaned_Word": original,
        "Stemmed_Root": stemmed,
        "Lemmatized_Root": lemmed,
    }
    for index, triples in df['final_transform'].head(5).items()
    for original, stemmed, lemmed in triples
]

representation_df = pd.DataFrame(comparison_list)
print("--- FINAL DATA REPRESENTATION (Step-by-Step Word Analysis) ---")
display(representation_df.head(20))

--- FINAL DATA REPRESENTATION (Step-by-Step Word Analysis) ---


Unnamed: 0,Sentence_ID,Cleaned_Word,Stemmed_Root,Lemmatized_Root
0,1,go,go,go
1,1,jurong,jurong,jurong
2,1,point,point,point
3,1,crazy,crazi,crazy
4,1,available,avail,available
5,1,bugis,bugi,bugis
6,1,n,n,n
7,1,great,great,great
8,1,world,world,world
9,1,la,la,la
