# Notebook para preprocesar los datos provenientes de Reddit y Twitter

In [2]:
import re
import emoji
import contractions
import pandas as pd
import spacy
from tqdm import tqdm

In [3]:
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

In [4]:
# Regex patterns
URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
SYMBOL_PATTERN = re.compile(r'[^a-zA-Z0-9\s]')

def normalize_text(text, remove_emoji=True):
    """Lowercase, expand contractions, remove URLs and symbols."""
    if not isinstance(text, str):
        return ""
    
    # Lowercase
    text = text.lower()

    # Expand contractions (e.g. "don't" -> "do not")
    text = contractions.fix(text)

    # Remove URLs
    text = URL_PATTERN.sub('', text)

    # Optionally remove emojis
    if remove_emoji:
        text = emoji.replace_emoji(text, replace='')

    # Remove unnecessary symbols (keep alphanumeric and whitespace)
    text = SYMBOL_PATTERN.sub(' ', text)

    # Remove multiple spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text


def lemmatize_text(text):
    """Tokenize and lemmatize using spaCy."""
    if not isinstance(text, str):
        return ""
    
    doc = nlp(text)
    lemmas = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(lemmas)

In [5]:
def preprocess_dataframe(df, text_columns):
    """Apply normalization and lemmatization to specified text columns."""
    tqdm.pandas()

    for col in text_columns:
        print(f"→ Preprocessing column: {col}")
        df[f"{col}_clean"] = df[col].progress_apply(normalize_text)
        df[f"{col}_lemma"] = df[f"{col}_clean"].progress_apply(lemmatize_text)
    
    return df

In [6]:
df = pd.read_csv('../data/interim/IBD_estructured_text.csv')
df.head()

Unnamed: 0,id,subreddit,author,title,selftext,created_utc,comments,cuerpo
0,dflwn,CrohnsDisease,zakool21,Don't be afraid of diagnostic procedures....,"I'm not likely to frontpage this subreddit, bu...",2010-09-18 11:59:11,"['I have to agree, the worse part of a Colonos...",TÍTULO:\nDon't be afraid of diagnostic procedu...
1,dfyy1,CrohnsDisease,sphinctersayzwha,Mayo Clinic article on Crohn's Disease.,Mayo Clinic article on Crohn's Disease.,2010-09-19 15:11:21,[],TÍTULO:\nMayo Clinic article on Crohn's Diseas...
2,dfz1m,CrohnsDisease,WeDeserveDessert,Has anyone else here taken Remicade? What did...,Has anyone else here taken Remicade? What did...,2010-09-19 15:19:54,"[""Remicade (and a bypass) changed my life. Wh...",TÍTULO:\nHas anyone else here taken Remicade? ...
3,dh6zd,CrohnsDisease,RosenTurd,"I was Diagnosed With Crohns Disease at age 9 ,...",Like the title Says AMA,2010-09-22 04:50:57,"[""My son was diagnosed at ten. It's hell for h...",TÍTULO:\nI was Diagnosed With Crohns Disease a...
4,dhgot,CrohnsDisease,unknownpleasures,Has anyone here had to have bowel surgery more...,I had a small bowel resection and appendectomy...,2010-09-22 18:31:36,"[""I have a story:\n\nI had a small bowel resec...",TÍTULO:\nHas anyone here had to have bowel sur...


In [None]:
text_cols = ["title", "selftext", "comments"]
processed_df = preprocess_dataframe(df, text_cols)

→ Preprocessing column: title


100%|██████████| 183744/183744 [00:05<00:00, 34782.04it/s]
100%|██████████| 183744/183744 [06:10<00:00, 496.16it/s]


→ Preprocessing column: selftext


100%|██████████| 183744/183744 [01:09<00:00, 2630.62it/s]
100%|██████████| 183744/183744 [17:45<00:00, 172.37it/s]


→ Preprocessing column: comments


100%|██████████| 183744/183744 [05:12<00:00, 588.45it/s]  
100%|██████████| 183744/183744 [1:01:00<00:00, 50.19it/s] 


OSError: Cannot save file into a non-existent directory: '../data/processed'

In [8]:
processed_df.to_csv("../data/processed/reddit_ibd_preprocessed.csv", index=False)