#### Import needed libraries

In [None]:
import re
import pandas as pd
import numpy as np
import nltk

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

#### Download NLTK resources (only needed the first time)

In [None]:
# Fetch the tokenizer models and the stop-word corpus used further down.
for resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(resource)

#### Define custom text preprocessor & tokenizer
We will: 
* Remove HTML tags and any residual markup
* Lowercase all text
* Remove punctuation and digits
* Tokenize with NLTK's `word_tokenize`
* Remove stop words using the standard NLTK English list
* Apply Porter stemming to reduce terms to their root forms

In [None]:
# Shared, module-level helpers for custom_tokenizer: built once so that
# they are not re-created for every document.
stemmer = PorterStemmer()
# A set gives constant-time membership tests during stop-word filtering.
stop_words = set(stopwords.words('english'))

def custom_preprocessor(text: str) -> str:
    """Normalize a raw document string before tokenization.

    Steps, in order: replace HTML tags with a space, lowercase, replace
    runs of digits with a space, then replace every underscore and every
    character that is neither a word character nor whitespace with a space.

    Args:
        text: Raw document string.

    Returns:
        The cleaned, lowercased string. Removed spans are replaced by a
        space rather than deleted, so neighbouring words never fuse.
    """
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', ' ', text)
    # Lowercase
    text = text.lower()
    # Remove digits
    text = re.sub(r'\d+', ' ', text)
    # Remove punctuation. `\w` also matches "_", so the underscore has to be
    # listed explicitly or tokens such as "dark_roast" would stay glued.
    text = re.sub(r'[^\w\s]|_', ' ', text)
    return text

def custom_tokenizer(text: str) -> list[str]:
    """Tokenize text, drop stop words, and stem what remains.

    Args:
        text: Document string to split into tokens.

    Returns:
        Porter-stemmed tokens, in document order, excluding every token
        found in the module-level ``stop_words`` set.
    """
    return [
        stemmer.stem(word)
        for word in nltk.word_tokenize(text)
        if word not in stop_words
    ]

#### Load and view the data

In [None]:
# Read the training features, training targets, and test features from disk.
X_train, y_train, X_test = (
    pd.read_csv('data/X_train.csv'),
    pd.read_csv('data/y_train.csv'),
    pd.read_csv('data/X_test.csv'),
)

In [None]:
# Bare expression as the cell's last line: in a notebook this renders the
# training frame as the cell output.
X_train

#### Drop the coffee_id column, but keep it aside in case it is needed for output

In [None]:
# pop() removes the column from each frame in place and hands it back,
# so the identifiers stay available without being used as a feature.
train_ids, test_ids = X_train.pop('coffee_id'), X_test.pop('coffee_id')

#### Build sub-pipelines for attributes

In [None]:
# Numeric pipeline for 100g-USD
num_pipe = Pipeline([
    # log(1+x) to reduce skew.
    # feature_names_out='one-to-one' makes this step expose
    # get_feature_names_out(); without it, asking the pipeline for its
    # output feature names fails at this step.
    ('log',   FunctionTransformer(
        np.log1p,
        validate=True,
        feature_names_out='one-to-one',
    )),
    # standardize to zero mean / unit variance
    ('scale', StandardScaler()),
])

# One-hot encoding for roaster, roast, origin.
# Categories not seen during fit encode as an all-zero row instead of raising;
# the encoder returns a dense array.
cat_pipe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Review text: TF-IDF over unigrams and bigrams, then chi-squared selection.
txt_pipe = Pipeline(steps=[
    (
        'tfidf',
        TfidfVectorizer(
            # Text analysis is delegated to the custom callables, so the
            # built-in token pattern and stop-word list are switched off.
            preprocessor=custom_preprocessor,
            tokenizer=custom_tokenizer,
            token_pattern=None,
            stop_words=None,
            # Vocabulary shape: 1- and 2-grams, pruned by document frequency
            # and capped at 1000 terms.
            ngram_range=(1, 2),
            min_df=5,
            max_df=0.95,
            max_features=1000,
        ),
    ),
    # Keep the 500 terms with the highest chi-squared score against the target.
    ('select', SelectKBest(score_func=chi2, k=500)),
])

#### Combine everything into a ColumnTransformer

In [None]:
# Route each column group to its own sub-pipeline; unlisted columns are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipe, ['100g_USD']),
        ('cat', cat_pipe, ['roaster', 'roast', 'origin']),
        # A plain string (not a list) selects a 1-D column, which is the
        # input shape the text vectorizer expects.
        ('txt', txt_pipe, 'review'),
    ],
    remainder='drop',
    verbose_feature_names_out=False,
)

#### Fit the preprocessing on the training data, transform both train & test

In [None]:
# Fit on the training split only. The target is flattened to 1-D because the
# supervised feature-selection step inside the text pipeline needs it.
X_train_transformed = preprocessor.fit_transform(
    X=X_train,
    y=y_train.to_numpy().ravel(),
)
# Reuse the fitted transformers on the test split (no refitting).
X_test_transformed = preprocessor.transform(X=X_test)

#### Inspect outputs

In [None]:
# Report the (rows, columns) shape of both transformed matrices.
for label, matrix in (
    ("Transformed X_train shape:", X_train_transformed),
    ("Transformed X_test  shape:", X_test_transformed),
):
    print(label, matrix.shape)
