In [11]:
import spacy
# Load the spaCy pipeline "en_core_web_sm" once; clean_text_spacy below reads
# this module-level `nlp` on every call.
nlp = spacy.load("en_core_web_sm")

def clean_text_spacy(text):
    """Return the lemmas of the content words in ``text`` as a single string.

    ``text`` is lowercased and run through the module-level spaCy pipeline
    ``nlp``. Tokens flagged as stop words, and tokens that are not purely
    alphabetic, are skipped; the lemmas of the remaining tokens are joined
    with single spaces.
    """
    lemmas = []
    for tok in nlp(text.lower()):
        if tok.is_alpha and not tok.is_stop:
            lemmas.append(tok.lemma_)
    return " ".join(lemmas)

# Smoke test: the stop words, the colon and the URL token should all be dropped.
print(clean_text_spacy(text="This is a sample comment with a link: https://youtube.com"))


sample comment link


In [12]:
# 📦 Imports
import pandas as pd
import spacy
import emoji
import re
from pathlib import Path

# 🧠 Load spaCy English model
# NOTE: this rebinds the module-level `nlp` that the earlier cell already set
# to the same "en_core_web_sm" pipeline; clean_text_spacy below reads it.
nlp = spacy.load("en_core_web_sm")

# 📂 Load comments, discarding rows whose "comment" value is missing
data_path = Path("../data/comments.csv")
df = pd.read_csv(data_path).dropna(subset=["comment"])

# 🧹 Clean function using spaCy only
def clean_text_spacy(text):
    """Normalize a raw comment into a space-separated string of lemmas.

    Steps, in order: emojis and URLs are replaced by a space, apostrophes
    are deleted, every remaining character that is not an ASCII letter or
    whitespace is replaced by a space, the text is lowercased and run
    through the module-level spaCy pipeline ``nlp``, and the lemmas of the
    alphabetic, non-stop-word tokens are joined with single spaces.

    Args:
        text: Raw comment text. Any non-string value (e.g. ``None`` or a
            float NaN) produces an empty string instead of raising.

    Returns:
        str: The cleaned comment; empty when nothing survives the filters.
    """
    if not isinstance(text, str):
        return ""

    # Emojis, URLs and punctuation are swapped for a space rather than
    # removed outright, so the words on either side are not fused together
    # ("video!!!Loved" must not become "videoloved").
    text = emoji.replace_emoji(text, replace=' ')

    # Remove URLs ("http\S+" already covers "https..." links)
    text = re.sub(r"http\S+|www\S+", ' ', text)

    # Delete apostrophes so a contraction stays a single word ("don't" -> "dont")
    text = re.sub(r"['’]", '', text)

    # Keep only ASCII letters and whitespace
    text = re.sub(r"[^a-zA-Z\s]", ' ', text)

    # Lowercase
    text = text.lower()

    # spaCy processing
    doc = nlp(text)

    # Remove stopwords and lemmatize; whitespace tokens fail is_alpha and drop out
    tokens = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]

    return " ".join(tokens)

# 🌀 Run every comment through the cleaner and keep the result in a new column
df["cleaned_comment"] = df["comment"].map(clean_text_spacy)

# 💾 Write the frame, including the new column, to a separate CSV (no index column)
output_path = Path("../data/cleaned_comments.csv")
df.to_csv(output_path, index=False)

print(f"✅ Done! Cleaned comments saved to {output_path}")


✅ Done! Cleaned comments saved to ../data/cleaned_comments.csv
