In [None]:
import pandas as pd
import numpy as np
import regex as re
import io
from google.colab import files

In [None]:
# Open the Colab upload dialog; the result is indexed by uploaded file name
uploaded = files.upload()

# Guard: stop immediately when the expected dataset was not among the uploads
if 'message_dataset_50k.csv' not in uploaded:
    raise FileNotFoundError("Please upload 'message_dataset_50k.csv'")

print("✅ File uploaded successfully!")

Saving message_dataset_50k.csv to message_dataset_50k.csv
✅ File uploaded successfully!


In [None]:
# Read the two-column CSV (message, category) from the uploaded bytes.
# The file's first line is its own header row, so header=0 consumes that line
# while `names` replaces it with the lowercase column names used below.
# With header=None that header line would be loaded as an ordinary data row.
df = pd.read_csv(io.BytesIO(uploaded['message_dataset_50k.csv']),
                 names=['message', 'category'],
                 header=0)

print(f"📊 Total raw messages: {len(df)}")
print("\n🔍 First few rows:")
print(df.head())

📊 Total raw messages: 50001

🔍 First few rows:
                                             message       category
0                                            Message       Category
1  Final notice. Update your info: https://verify...           Spam
2  Reset your password now at https://get-rich-fa...           Spam
3  Your transaction ID is TXN471861. Please keep ...  Transactional
4  Your package with tracking ID 162556 has been ...  Transactional


In [None]:
def clean_text_keep_urls(text):
    """Normalize one raw message while leaving URLs in place.

    Missing values (anything ``pd.isna`` reports as null) become an empty
    string. Any other value is converted with ``str``, trimmed, has every run
    of whitespace collapsed to a single space, and is lower-cased. Nothing
    except whitespace is removed, so URLs survive (lower-cased).
    """
    if pd.isna(text):
        return ""
    collapsed = re.sub(r'\s+', ' ', str(text).strip())
    return collapsed.lower()

# Derive the normalized text column from the raw messages, one value at a time
print("🧹 Cleaning messages...")
df['cleaned_message'] = df['message'].map(clean_text_keep_urls)

🧹 Cleaning messages...


In [None]:
# Keep only the first row for each distinct raw message text
df = df.drop_duplicates(subset='message')

In [None]:
# Write the three export columns, in this order, without the index column
df.to_csv('labeled_sms_dataset.csv',
          columns=['message', 'cleaned_message', 'category'],
          index=False)
print(f"✅ Cleaned dataset saved! Shape: {df.shape}")

✅ Cleaned dataset saved! Shape: (4221, 3)


In [None]:
# Hand the CSV written by the previous cell to the browser as a download
files.download('labeled_sms_dataset.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from google.colab import files

# Export frame: the three columns of the cleaned, de-duplicated data
final_df = df[['message', 'cleaned_message', 'category']].copy()

# Coerce every column to str. fillna('') runs first because astype(str) on its
# own turns a missing value into the literal text "nan" in the exported file.
for column in ['message', 'cleaned_message', 'category']:
    final_df[column] = final_df[column].fillna('').astype(str)

# Write UTF-8, no index column, explicit '\n' row terminator
final_df.to_csv('labeled_sms_dataset.csv',
                index=False,
                encoding='utf-8',
                lineterminator='\n')

# Send the exported file to the browser
files.download('labeled_sms_dataset.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from google.colab import files

# Export frame: the three columns of the cleaned, de-duplicated data
final_df = df[['message', 'cleaned_message', 'category']].copy()

# Ensure clean strings with no NaN: fillna('') must precede astype(str),
# otherwise a missing value is exported as the literal text "nan".
for column in ['message', 'cleaned_message', 'category']:
    final_df[column] = final_df[column].fillna('').astype(str).str.strip()

# Replace line breaks inside a message with a space so each record stays on one
# physical line. The pattern covers real CR/LF characters as well as the
# literal two-character sequences backslash-r and backslash-n.
for column in ['message', 'cleaned_message']:
    final_df[column] = final_df[column].str.replace(r'\r|\\r|\n|\\n', ' ', regex=True)

# Write UTF-8, no index column, explicit '\n' row terminator
final_df.to_csv('labeled_sms_dataset_clean.csv',
                index=False,
                encoding='utf-8',
                lineterminator='\n')

# Send the exported file to the browser
files.download('labeled_sms_dataset_clean.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Reload the exported file and summarise it: preview, shape, rows per label
df = pd.read_csv("labeled_sms_dataset_clean.csv")
category_counts = df['category'].value_counts()
print(df.head())
print(f"\n📊 Shape: {df.shape}")
print(f"Categories:\n{category_counts}")

                                             message  \
0                                            Message   
1  Final notice. Update your info: https://verify...   
2  Reset your password now at https://get-rich-fa...   
3  Your transaction ID is TXN471861. Please keep ...   
4  Your package with tracking ID 162556 has been ...   

                                     cleaned_message       category  
0                                            message       Category  
1  final notice. update your info: https://verify...           Spam  
2  reset your password now at https://get-rich-fa...           Spam  
3  your transaction id is txn471861. please keep ...  Transactional  
4  your package with tracking id 162556 has been ...  Transactional  

📊 Shape: (4221, 3)
Categories:
category
Transactional    4020
Spam              100
Promotional       100
Category            1
Name: count, dtype: int64


In [None]:
# Round-trip check: read the exported CSV back and report what it contains
df = pd.read_csv("labeled_sms_dataset_clean.csv")
category_counts = df['category'].value_counts()
print(df.head())
print(f"\nShape: {df.shape}")
print("\nCategories:", category_counts, sep="\n")

                                             message  \
0                                            Message   
1  Final notice. Update your info: https://verify...   
2  Reset your password now at https://get-rich-fa...   
3  Your transaction ID is TXN471861. Please keep ...   
4  Your package with tracking ID 162556 has been ...   

                                     cleaned_message       category  
0                                            message       Category  
1  final notice. update your info: https://verify...           Spam  
2  reset your password now at https://get-rich-fa...           Spam  
3  your transaction id is txn471861. please keep ...  Transactional  
4  your package with tracking id 162556 has been ...  Transactional  

Shape: (4221, 3)

Categories:
category
Transactional    4020
Spam              100
Promotional       100
Category            1
Name: count, dtype: int64


In [None]:
# Write the current `df` to labeled_sms_dataset_clean.csv:
# no index column, UTF-8 encoding, '\n' as the row terminator.
df.to_csv('labeled_sms_dataset_clean.csv',
          index=False,
          encoding='utf-8',
          lineterminator='\n')

In [None]:
# Hand labeled_sms_dataset_clean.csv to the browser as a download.
# (`files` is re-imported here so the cell can run on its own.)
from google.colab import files
files.download('labeled_sms_dataset_clean.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from google.colab import files

# Export frame: the three dataset columns of the current `df`
final_df = df[['message', 'cleaned_message', 'category']].copy()

# Ensure clean strings. `df` was reloaded with read_csv, which yields NaN for
# empty cells; fillna('') must precede astype(str), otherwise those cells are
# exported as the literal text "nan".
for column in ['message', 'cleaned_message', 'category']:
    final_df[column] = final_df[column].fillna('').astype(str).str.strip()

# Replace line breaks inside a cell with a space. The pattern covers real
# CR/LF characters as well as the literal two-character sequences
# backslash-r and backslash-n.
for column in ['message', 'cleaned_message']:
    final_df[column] = final_df[column].str.replace(r'\r|\\r|\n|\\n', ' ', regex=True)

# Write UTF-8, no index column, explicit '\n' row terminator
final_df.to_csv('labeled_sms_dataset_clean_final.csv',
                index=False,
                encoding='utf-8',
                lineterminator='\n')

# Send the exported file to the browser
files.download('labeled_sms_dataset_clean_final.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Reload the file exported above to confirm it parses back into three columns
import pandas as pd
df = pd.read_csv("labeled_sms_dataset_clean_final.csv")
print(df.head())
print(df.shape)  # Row count is that of the de-duplicated data (the recorded run shows (4221, 3)), not the ~50k raw rows

                                             message  \
0                                            Message   
1  Final notice. Update your info: https://verify...   
2  Reset your password now at https://get-rich-fa...   
3  Your transaction ID is TXN471861. Please keep ...   
4  Your package with tracking ID 162556 has been ...   

                                     cleaned_message       category  
0                                            message       Category  
1  final notice. update your info: https://verify...           Spam  
2  reset your password now at https://get-rich-fa...           Spam  
3  your transaction id is txn471861. please keep ...  Transactional  
4  your package with tracking id 162556 has been ...  Transactional  
(4221, 3)


In [None]:
from google.colab import files

# Export frame: the three dataset columns of the current `df`
final_df = df[['message', 'cleaned_message', 'category']].copy()

# Ensure clean strings. `df` was reloaded with read_csv, which yields NaN for
# empty cells; fillna('') must precede astype(str), otherwise those cells are
# exported as the literal text "nan".
for column in ['message', 'cleaned_message', 'category']:
    final_df[column] = final_df[column].fillna('').astype(str).str.strip()

# Replace line breaks inside a message with a space. The pattern covers real
# CR/LF characters as well as the literal two-character sequences
# backslash-r and backslash-n.
for column in ['message', 'cleaned_message']:
    final_df[column] = final_df[column].str.replace(r'\r|\n|\\r|\\n', ' ', regex=True)

# Write UTF-8, no index column, explicit '\n' row terminator
final_df.to_csv('labeled_sms_dataset_clean_1.csv',
                index=False,
                encoding='utf-8',
                lineterminator='\n')

# Send the exported file to the browser
files.download('labeled_sms_dataset_clean_1.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd

# Round-trip check on the latest export: preview, shape, rows per label
df = pd.read_csv("labeled_sms_dataset_clean_1.csv")
category_counts = df['category'].value_counts()
print(df.head())
print(f"\nShape: {df.shape}")
print("\nCategories:", category_counts, sep="\n")

                                             message  \
0                                            Message   
1  Final notice. Update your info: https://verify...   
2  Reset your password now at https://get-rich-fa...   
3  Your transaction ID is TXN471861. Please keep ...   
4  Your package with tracking ID 162556 has been ...   

                                     cleaned_message       category  
0                                            message       Category  
1  final notice. update your info: https://verify...           Spam  
2  reset your password now at https://get-rich-fa...           Spam  
3  your transaction id is txn471861. please keep ...  Transactional  
4  your package with tracking id 162556 has been ...  Transactional  

Shape: (4221, 3)

Categories:
category
Transactional    4020
Spam              100
Promotional       100
Category            1
Name: count, dtype: int64


In [None]:
from google.colab import files

# Export frame: the three dataset columns of the current `df`
final_df = df[['message', 'cleaned_message', 'category']].copy()

# Ensure clean strings. `df` was reloaded with read_csv, which yields NaN for
# empty cells; fillna('') must precede astype(str), otherwise those cells are
# exported as the literal text "nan".
for column in ['message', 'cleaned_message', 'category']:
    final_df[column] = final_df[column].fillna('').astype(str).str.strip()

# Replace newlines and carriage returns inside a message with a space. The
# pattern covers real CR/LF characters as well as the literal two-character
# sequences backslash-r and backslash-n.
for column in ['message', 'cleaned_message']:
    final_df[column] = final_df[column].str.replace(r'\r|\n|\\r|\\n', ' ', regex=True)

# Write UTF-8, no index column, explicit '\n' row terminator
final_df.to_csv('labeled_sms_dataset_CLEAN.csv',
                index=False,
                encoding='utf-8',
                lineterminator='\n')

# Send the exported file to the browser
files.download('labeled_sms_dataset_CLEAN.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from google.colab import files
import re

# Upload the CSV whose rows need to be reconstructed
uploaded = files.upload()
if not uploaded:
    raise FileNotFoundError("No file was uploaded")

# Use the name the upload is actually keyed by instead of a hard-coded literal:
# a literal silently reads a different (possibly stale) file whenever the
# uploaded name differs, while the message below still claims it was uploaded.
# NOTE(review): assumes the key equals the path Colab saved the file under, as
# the "Saving X to X" upload output suggests; confirm for renamed duplicates.
filename = next(iter(uploaded))
print(f"✅ Uploaded: {filename}")

# Read the entire file as one string, trimmed of surrounding whitespace
with open(filename, 'r', encoding='utf-8') as f:
    raw_text = f.read().strip()

print("🔍 First 200 chars:", raw_text[:200])

Saving labeled_sms_dataset_cleaned.csv to labeled_sms_dataset_cleaned.csv


NameError: name 'labeled_sms_dataset_cleaned' is not defined

In [None]:
# Re-read the most recent upload without prompting for it again
if not uploaded:
    raise FileNotFoundError("No file was uploaded")

# Use the name the upload is actually keyed by instead of a hard-coded literal:
# a literal silently reads a different (possibly stale) file whenever the
# uploaded name differs, while the message below still claims it was uploaded.
# NOTE(review): assumes the key equals the path Colab saved the file under;
# confirm for renamed duplicates.
filename = next(iter(uploaded))
print(f"✅ Uploaded: {filename}")

# Read the entire file as one string, trimmed of surrounding whitespace
with open(filename, 'r', encoding='utf-8') as f:
    raw_text = f.read().strip()

print("🔍 First 200 chars:", raw_text[:200])

✅ Uploaded: labeled_sms_dataset_cleaned.csv
🔍 First 200 chars: message,cleaned_message,category
Final notice. Update your info: https://verify-now.online,final notice. update your info: https://verify-now.online,Spam
Reset your password now at https://get-rich-fa


In [None]:
# Rebuild one-record-per-line text from `raw_text`, then collect the non-empty lines.

# Copy of the raw text with a leading newline.
# NOTE(review): `text` is not referenced again in this cell — confirm it is still needed.
text = "\n" + raw_text

# First attempt: a split point (newline or comma) followed by something that looks
# like the start of a message. Each record has the form: [original],[cleaned],[category]
# NOTE(review): `pattern` is not referenced again in this cell — confirm it is still needed.
pattern = r'(?:\n|,)(?=Your |Reset |Final notice|Enjoy |\w.*?:|\d{4,})'

# Approach actually used: insert "\n" right after a category label when the next
# record starts immediately behind it with one of the listed phrases,
# e.g. ...TransactionalYour OTP → ...Transactional\nYour OTP
# Records that are already separated by a newline are left unchanged, because the
# lookahead only matches when the phrase directly follows the label.

fixed = re.sub(r'(,Spam)(?=Your |Reset |Final notice|Check |Enjoy |Your package)', r'\1\n', raw_text)
fixed = re.sub(r'(,Transactional)(?=Your |Reset |Final notice|Check |Enjoy |Your package)', r'\1\n', fixed)
fixed = re.sub(r'(,Promotional)(?=Check |Enjoy |Limited time|Buy now)', r'\1\n', fixed)

# Split on newlines, trim each piece and drop blank ones. Only the opening phrases
# listed above trigger a split, so a fused record starting with any other text
# stays joined to the previous one.
lines = [line.strip() for line in fixed.split('\n') if line.strip()]
print(f"✅ Found {len(lines)} reconstructed messages")

✅ Found 4221 reconstructed messages


In [None]:
import csv

import pandas as pd

# A line is kept only when its third field is exactly one of these labels
VALID_CATEGORIES = ('Spam', 'Transactional', 'Promotional')

data = []
skipped = 0
for line in lines:
    # Parse with the csv module rather than str.split(','): a quoted field that
    # contains a comma must stay one field. A naive split shifts the columns,
    # the category check then fails and the row is silently lost.
    # Each line is parsed on its own so one malformed line cannot affect others.
    parts = next(csv.reader([line]), [])
    if len(parts) >= 3:
        message = parts[0].strip()
        cleaned = parts[1].strip()
        cat = parts[2].strip()
        if cat in VALID_CATEGORIES:
            data.append([message, cleaned, cat])
            continue
    # Header lines, short lines and unknown categories end up here
    skipped += 1

# Create DataFrame
df = pd.DataFrame(data, columns=['message', 'cleaned_message', 'category'])
print(f"📊 Final dataset shape: {df.shape}")
print(f"Skipped {skipped} line(s) without a recognised category")
print("\n📋 First few rows:")
print(df.head())

📊 Final dataset shape: (4210, 3)

📋 First few rows:
                                             message  \
0  Final notice. Update your info: https://verify...   
1  Reset your password now at https://get-rich-fa...   
2  Your transaction ID is TXN471861. Please keep ...   
3  Your package with tracking ID 162556 has been ...   
4  Earn money fast! Start today: https://login-no...   

                                     cleaned_message       category  
0  final notice. update your info: https://verify...           Spam  
1  reset your password now at https://get-rich-fa...           Spam  
2  your transaction id is txn471861. please keep ...  Transactional  
3  your package with tracking id 162556 has been ...  Transactional  
4  earn money fast! start today: https://login-no...           Spam  


In [None]:
# Rows per label in the parsed dataset
category_counts = df['category'].value_counts()
print("\n🔢 Category distribution:")
print(category_counts)


🔢 Category distribution:
category
Transactional    4010
Spam              100
Promotional       100
Name: count, dtype: int64


In [None]:
# Force str dtype on every column and trim surrounding whitespace
for column in ('message', 'cleaned_message', 'category'):
    df[column] = df[column].astype(str).str.strip()

# Collapse each run of CR/LF characters inside a message into a single space
for column in ('message', 'cleaned_message'):
    df[column] = df[column].str.replace(r'[\r\n]+', ' ', regex=True)

# Write UTF-8, no index column, '\n' as the row terminator
df.to_csv('labeled_sms_dataset_CLEAN.csv',
          index=False, encoding='utf-8', lineterminator='\n')

# Send the exported file to the browser
files.download('labeled_sms_dataset_CLEAN.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Round-trip check: the exported CSV must read back as a three-column frame
test_df = pd.read_csv('labeled_sms_dataset_CLEAN.csv')
preview = test_df.head()
print(preview)
print(f"\nShape: {test_df.shape}")

                                             message  \
0  Final notice. Update your info: https://verify...   
1  Reset your password now at https://get-rich-fa...   
2  Your transaction ID is TXN471861. Please keep ...   
3  Your package with tracking ID 162556 has been ...   
4  Earn money fast! Start today: https://login-no...   

                                     cleaned_message       category  
0  final notice. update your info: https://verify...           Spam  
1  reset your password now at https://get-rich-fa...           Spam  
2  your transaction id is txn471861. please keep ...  Transactional  
3  your package with tracking id 162556 has been ...  Transactional  
4  earn money fast! start today: https://login-no...           Spam  

Shape: (4210, 3)


In [None]:
import pandas as pd

# Reload the exported dataset and report how many rows each label has
df = pd.read_csv("labeled_sms_dataset_CLEAN.csv")
label_counts = df['category'].value_counts()
print(label_counts)

category
Transactional    4010
Spam              100
Promotional       100
Name: count, dtype: int64


In [None]:
from sklearn.utils import resample

# Split the dataset by label
df_transactional = df[df['category'] == 'Transactional']
df_spam = df[df['category'] == 'Spam']
df_promo = df[df['category'] == 'Promotional']

# Upsample the two minority classes to the size of the Transactional class.
# The target is derived from the data rather than hard-coded, so the result
# stays balanced when the upstream row counts change.
target_size = len(df_transactional)
df_spam_upsampled = resample(df_spam, replace=True, n_samples=target_size, random_state=42)
df_promo_upsampled = resample(df_promo, replace=True, n_samples=target_size, random_state=42)
# NOTE(review): sampling with replacement duplicates rows; if this frame is later
# split into train/test sets, copies of one message can land on both sides.

# Combine all three classes into one balanced frame
df_balanced = pd.concat([df_transactional, df_spam_upsampled, df_promo_upsampled])

# Shuffle the rows (fixed seed) and renumber the index
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Check the new per-class counts
print(df_balanced['category'].value_counts())


category
Spam             4010
Promotional      4010
Transactional    4010
Name: count, dtype: int64


In [None]:
# Persist the balanced, shuffled dataset without the index column
df_balanced.to_csv("labeled_sms_dataset_BALANCED.csv", index=False)
