#### Import needed libraries

In [1]:
import re
import pandas as pd
import numpy as np
import nltk

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

#### Download NLTK resources (only needed first time)

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/danielmasamba/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/danielmasamba/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/danielmasamba/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

#### Define custom text preprocessor & tokenizer
We will: 
* Remove HTML-tag and any residual markup
* Lowercase all text
* Remove punctuation and digits
* Tokenize on whitespace or with a regex tokenizer
* Remove stop-word using NLTK standard English list
* Porter Stemming to reduce terms to root forms

In [3]:
stop_words = set(stopwords.words('english'))
stemmer    = PorterStemmer()

def custom_preprocessor(text: str) -> str:
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', ' ', text)
    # Lowercase
    text = text.lower()
    # Remove digits
    text = re.sub(r'\d+', ' ', text)
    # Remove punctuation (keep only word chars and whitespace)
    text = re.sub(r'[^\w\s]', ' ', text)
    return text

def custom_tokenizer(text: str) -> list[str]:
    # Basic word tokenization
    tokens = nltk.word_tokenize(text)
    # Stop-word removal
    tokens = [t for t in tokens if t not in stop_words]
    # Stemming
    tokens = [stemmer.stem(t) for t in tokens]
    return tokens

#### Load and view the data

In [4]:
X_train = pd.read_csv('data/X_train.csv')
y_train = pd.read_csv('data/y_train.csv')
X_test  = pd.read_csv('data/X_test.csv')

In [5]:
X_train

Unnamed: 0,coffee_id,roaster,roast,origin,100g_USD,review
0,1014,Kakalove Cafe,Medium-Light,Taiwan,11.33,"Delicate, fruit-forward. Blueberry, molasses, ..."
1,1000,Port of Mokha,,Yemen,39.68,"Deep yet soaring, vertically complex. Dried bl..."
2,1094,Simon Hsieh Aroma Roast Coffees,Medium,Kenya,12.04,"Evaluated as espresso. Richly chocolaty, compl..."
3,142,JBC Coffee Roasters,Medium-Light,Ethiopia,5.51,"Floral, bright, citrusy, balanced. Star jasmin..."
4,647,Roast House,Medium-Light,Ethiopia,4.19,"Delicate, sweetly spice-toned. Pink peppercorn..."
...,...,...,...,...,...,...
615,898,Green Stone Coffee,Medium-Light,Kenya,9.24,"Rich-toned, deeply aromatic. Black currant, to..."
616,384,Home in Harmony,Medium-Light,Ethiopia,4.93,"Cleanly fruit-toned, delicately sweet-tart. Ra..."
617,588,David's Nose,Light,Ethiopia,5.66,"Richly aromatic, sweetly tart. Boysenberry, st..."
618,1180,El Gran Cafe,Medium-Light,Guatemala,5.88,"Evaluated as espresso. Multi-layered, complex...."


#### Drop coffee_id column but keep aside to use for output

In [6]:
train_ids = X_train.pop('coffee_id')
test_ids  = X_test.pop('coffee_id')

In [7]:
train_ids

0      1014
1      1000
2      1094
3       142
4       647
       ... 
615     898
616     384
617     588
618    1180
619     200
Name: coffee_id, Length: 620, dtype: int64

#### Build sub-pipelines for attributes

In [8]:
# Numeric pipeline for 100g-USD
num_pipe = Pipeline([
    # log(1+x) to reduce skew
    ('log',   FunctionTransformer(np.log1p, validate=True)),
    # standardize
    ('scale', StandardScaler()),
])

# Categorical pipeline for roaster, roast, origin
cat_pipe = OneHotEncoder(
    handle_unknown='ignore',
    sparse_output=False
)

# Text pipeline for review
txt_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(
        preprocessor=custom_preprocessor,
        tokenizer=custom_tokenizer,
        stop_words=None,        # we handle stop‐words in tokenizer
        ngram_range=(1, 2),
        min_df=5,
        max_df=0.95,
        max_features=1000
    )),
    ('select', SelectKBest(chi2, k=500)),
])

#### Combine everything into a ColumnTransformer

In [9]:
preprocessor = ColumnTransformer([
    ('num', num_pipe,   ['100g_USD']),
    ('cat', cat_pipe,   ['roaster', 'roast', 'origin']),
    ('txt', txt_pipe,   'review'),
], remainder='drop', verbose_feature_names_out=False)

#### Fit the preprocessing on the training data, transform both train & test

In [10]:
X_train_transformed = preprocessor.fit_transform(X_train, y_train.values.ravel())
X_test_transformed  = preprocessor.transform(X_test)



#### Inspect outputs

In [11]:
print("Transformed X_train shape:", X_train_transformed.shape)
print("Transformed X_test  shape:", X_test_transformed.shape)
print("#"*60)
print(X_train_transformed)


Transformed X_train shape: (620, 728)
Transformed X_test  shape: (267, 728)
############################################################
[[ 0.43423367  0.          0.         ...  0.          0.
   0.        ]
 [ 2.215536    0.          0.         ...  0.          0.
   0.18672739]
 [ 0.51777921  0.          0.         ...  0.          0.
   0.        ]
 ...
 [-0.48486757  0.          0.         ...  0.          0.
   0.        ]
 [-0.43637064  0.          0.         ...  0.          0.
   0.        ]
 [-0.94586458  0.          0.         ...  0.          0.
   0.        ]]
