In [122]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import word2vec
from xgboost import XGBClassifier
from joblib import Memory


from collections import Counter
import itertools
import os
import string

nltk.download('stopwords')
nltk.download('punkt')  

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\danny\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\danny\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [91]:
class CustomEmbeddingVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, vocabulary, embedding_weights, size_features=100):
        self.vocabulary = vocabulary
        self.embedding_weights = embedding_weights
        self.size_features = size_features

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        features = []
        for sent in X:
            vecs = [self.embedding_weights[self.vocabulary[word]]
                    for word in sent if word in self.vocabulary]
            if vecs:
                features.append(np.mean(vecs, axis=0))
            else:
                features.append(np.zeros(self.size_features))
        return np.array(features)

In [92]:
def get_embeddings(inp_data, vocabulary_inv, size_features=100,
                   mode='skipgram',
                   min_word_count=2,
                   context=5):
    model_name = "embedding"
    model_name = os.path.join(model_name)
    num_workers = 15
    downsampling = 1e-3 
    print('Training Word2Vec model...')
    # use inp_data and vocabulary_inv to reconstruct sentences
    sentences = [[vocabulary_inv[w] for w in s] for s in inp_data]
    if mode == 'skipgram':
        sg = 1
        print('Model: skip-gram')
    elif mode == 'cbow':
        sg = 0
        print('Model: CBOW')
    embedding_model = word2vec.Word2Vec(sentences, workers=num_workers,
                                        sg=sg,
                                        vector_size=size_features,
                                        min_count=min_word_count,
                                        window=context,
                                        sample=downsampling)
    print("Saving Word2Vec model {}".format(model_name))
    embedding_weights = np.zeros((len(vocabulary_inv), size_features))
    for i in range(len(vocabulary_inv)):
        word = vocabulary_inv[i]
        if word in embedding_model.wv:
            embedding_weights[i] = embedding_model.wv[word]
        else:
            embedding_weights[i] = np.random.uniform(-0.25, 0.25,
                                                     embedding_model.vector_size)
    return embedding_weights

In [93]:
def build_vocab(sentences):
    word_counts = Counter(itertools.chain(*sentences))
    vocabulary_inv = [x[0] for x in word_counts.most_common()]
    vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
    return word_counts, vocabulary, vocabulary_inv

In [94]:
def decode_bytes(val):
    if isinstance(val, bytes):
        try:
            return val.decode('utf-8').strip("ub'\"")  # remove u'', b'', extra quotes
        except:
            return val
    return val

In [95]:
def preprocess_df(df):
    # get English stopwords
    stop_words = set(stopwords.words('english'))
    stop_words.add('would')
    # prepare translation table to translate punctuation to space
    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    preprocessed_sentences = []
    for i, row in df.iterrows():
        sent = row["text"]
        sent_nopuncts = sent.translate(translator)
        words_list = sent_nopuncts.strip().split()
        filtered_words = [word for word in words_list if word not in stop_words and len(word) != 1]
        preprocessed_sentences.append(" ".join(filtered_words))
    df["text"] = preprocessed_sentences
    return df

In [96]:
data_path = "/Users/danny/OneDrive/Documents/UCSD/DSC 258R/kaggle_proj/"

# 1. Load training dataset (with labels)
train_df = pd.read_csv(data_path + "train.csv")
test_df = pd.read_csv(data_path + "test.csv")
train_df["text"] = train_df["review"]
test_df["text"] = test_df["review"]

train_df = preprocess_df(train_df)  # Applies stopword removal & punctuation cleanup
test_df = preprocess_df(test_df)

tagged_train_data = [word_tokenize(sent) for sent in train_df["text"]]
tagged_test_data = [word_tokenize(sent) for sent in test_df["text"]]

train_df["tokens"] = tagged_train_data
test_df["tokens"] = tagged_test_data

word_counts, vocabulary, vocabulary_inv = build_vocab(tagged_train_data)
inp_data = [[vocabulary[word] for word in sent if word in vocabulary] for sent in tagged_train_data]
embedding_weights = get_embeddings(inp_data, vocabulary_inv, size_features=100)

Training Word2Vec model...
Model: skip-gram
Saving Word2Vec model embedding


In [97]:
valid_columns = train_df.columns[train_df.count() >= 10000]

# Filter all datasets to keep only those columns
train_df = train_df[valid_columns]

cols_to_drop = ['id', 'hours.Tuesday', 'hours.Saturday', 'hours.Friday', 'attributes.WiFi', 'postal_code', 'hours.Thursday', 'text', 'review', 'address', 'hours.Wednesday', 'hours', 'business_id', 'attributes']

train_df = train_df.drop(columns=cols_to_drop, errors='ignore')
test_df = test_df.drop(columns=cols_to_drop, errors='ignore')

train_df = train_df.applymap(decode_bytes)
test_df = test_df.applymap(decode_bytes)

non_numeric_cols = train_df.select_dtypes(include=['object']).columns.tolist()
numeric_cols = train_df.select_dtypes(include=[np.number]).columns.difference(['label']).tolist()

cols_to_exclude = ['label', 'tokens']  # add more if needed
categorical_cols = [col for col in non_numeric_cols if col not in cols_to_exclude]

print(f"Training data loaded with {train_df.shape[0]} rows and {train_df.shape[1]} columns")
print(f"Test data loaded with {test_df.shape[0]} rows and {test_df.shape[1]} columns")

# 3. Verify the training data has the label column
target_column = 'label'

print(f"\nTarget column: {target_column}")
print("Target value distribution:")
print(train_df[target_column].value_counts())

class_counts = train_df['label'].value_counts()

print(class_counts)

print(train_df.columns.tolist())

Training data loaded with 13144 rows and 23 columns
Test data loaded with 10000 rows and 49 columns

Target column: label
Target value distribution:
label
american (traditional)    2680
mexican                   2217
italian                   2032
chinese                   1696
american (new)            1399
japanese                  1063
mediterranean              728
canadian (new)             484
thai                       483
asian fusion               362
Name: count, dtype: int64
label
american (traditional)    2680
mexican                   2217
italian                   2032
chinese                   1696
american (new)            1399
japanese                  1063
mediterranean              728
canadian (new)             484
thai                       483
asian fusion               362
Name: count, dtype: int64
['attributes.Ambience', 'attributes.OutdoorSeating', 'longitude', 'name', 'attributes.RestaurantsReservations', 'attributes.RestaurantsPriceRange2', 'attributes.NoiseL

  train_df = train_df.applymap(decode_bytes)
  test_df = test_df.applymap(decode_bytes)


In [98]:
train_df.head()

Unnamed: 0,attributes.Ambience,attributes.OutdoorSeating,longitude,name,attributes.RestaurantsReservations,attributes.RestaurantsPriceRange2,attributes.NoiseLevel,state,attributes.Alcohol,attributes.HasTV,...,is_open,city,stars,attributes.RestaurantsTakeOut,latitude,attributes.RestaurantsAttire,attributes.RestaurantsDelivery,attributes.GoodForKids,label,tokens
0,"b""{'romantic': False, 'intimate': False, 'clas...",b'False',-81.820568,b'Rush Inn',b'False',b'2',"b""u'loud'""",b'OH',"b""u'full_bar'""",b'True',...,1,b'Lakewood',4.0,b'True',41.484197,"b""u'casual'""",b'False',b'False',american (traditional),"[So, stopped, way, Side, Quest, street, nWe, k..."
1,"b""{'romantic': False, 'intimate': False, 'tour...",b'True',-112.032893,b'GreenMix',b'False',b'2',"b""u'quiet'""",b'AZ',"b""u'none'""",b'False',...,1,b'Phoenix',3.5,b'True',33.379283,"b""u'casual'""",b'False',b'True',american (new),"[This, go, healthy, spot, The, food, always, f..."
2,,,-79.339163,b'BarBurrito - Gerrard',b'False',,,b'ON',,,...,1,b'Toronto',3.0,b'False',43.669144,,,,mexican,"[Food, court, meal, Gerrard, Square, It, since..."
3,"b""{'romantic': False, 'intimate': False, 'clas...",b'False',-115.242714,b'SalvaMex',b'False',b'1',"b""'quiet'""",b'NV',"b""'none'""",b'True',...,0,b'Las Vegas',4.0,b'True',36.159527,"b""'casual'""",b'True',b'True',mexican,"[Located, Rainbow, Charleston, small, family, ..."
4,"b""{'romantic': False, 'intimate': False, 'tour...",b'False',-81.726357,b'Hop Hing',b'False',b'1',,b'OH',"b""u'none'""",b'False',...,1,b'North Royalton',3.5,b'True',41.330546,"b""u'casual'""",b'False',b'True',chinese,"[No, frills, Chinese, takeout, joint, serves, ..."


In [99]:
text_pipeline = Pipeline([
    ('embed', CustomEmbeddingVectorizer(vocabulary, embedding_weights))
])

categorical_pipeline = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numeric_pipeline = Pipeline([
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(transformers=[
    ('text', text_pipeline, 'tokens'),
    ('cat', categorical_pipeline, categorical_cols),
    ('num', numeric_pipeline, numeric_cols)
])

In [None]:
# Logistic Regression Model with TF-IDF

X_full = train_df[['tokens'] + categorical_cols + numeric_cols]
X_test = test_df[['tokens'] + categorical_cols + numeric_cols]

label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_df["label"])
y_full = label_encoder.fit_transform(train_df["label"])

X_train_text, X_val_text, y_train, y_val = train_test_split(
    train_df['tokens'].apply(lambda x: ' '.join(x)),  # join tokens into text
    train_df['label'],
    test_size=0.2,
    random_state=42
)

tfidf = TfidfVectorizer(ngram_range=(1,2), max_df=0.75, min_df=3)

# Define pipeline
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(max_iter=5000, solver='liblinear', class_weight='balanced'))
])


# TF-IDF param grid
param_grid = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'tfidf__min_df': [1, 3, 5],
    'tfidf__max_df': [0.75, 0.9, 1.0],
    'tfidf__max_features': [None, 10000, 20000],
    'clf__C': [0.1, 1, 10]
}



grid = GridSearchCV(text_clf, param_grid, scoring='accuracy', cv=5, n_jobs=-1, verbose=2)
grid.fit(X_train_text, y_train)

print("Best parameters:", grid.best_params_)
print("Best CV score:", grid.best_score_)

# Fit and evaluate
# text_clf.fit(X_train_text, y_train)
# val_preds = text_clf.predict(X_val_text)
# acc = accuracy_score(y_val, val_preds)

# print(f"Validation accuracy: {acc:.4f}")


Fitting 5 folds for each of 162 candidates, totalling 810 fits
Best parameters: {'clf__C': 10, 'tfidf__max_df': 0.75, 'tfidf__max_features': None, 'tfidf__min_df': 3, 'tfidf__ngram_range': (1, 2)}
Best CV score: 0.8073228720874941


In [None]:
tfidf = TfidfVectorizer(ngram_range=(1,2), max_df=0.75, min_df=3)

X_full = train_df[['tokens'] + categorical_cols + numeric_cols]
X_test = test_df[['tokens'] + categorical_cols + numeric_cols]

label_encoder = LabelEncoder()
#y_train = label_encoder.fit_transform(train_df["label"])
y_full = label_encoder.fit_transform(train_df["label"])

X_train, X_val, y_train, y_val = train_test_split(
    X_full, y_full, test_size=0.2, random_state=42, stratify=y_full
)

xgb = XGBClassifier(
    objective='multi:softmax',  # or 'multi:prob' if you want probabilities
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42,
    n_jobs=-1
)

cache_dir = './cache_dir'
memory = Memory(cache_dir, verbose=0)

pipe = Pipeline([
    ('tfidf', tfidf),  # fixed from Step 1
    ('clf', xgb)
], memory=memory)

param_grid = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [4, 6],
    'clf__learning_rate': [0.05, 0.1, 0.2],
    'clf__subsample': [0.8, 1.0],
    'clf__colsample_bytree': [1.0]
}

grid = GridSearchCV(pipe, param_grid, cv=3, verbose=2, n_jobs=-1, error_score='raise')
grid.fit(X_train_text, y_train)

print("Best score:", grid.best_score_)
print("Best params:", grid.best_params_)
best_model = grid.best_estimator_

Fitting 3 folds for each of 24 candidates, totalling 72 fits


In [None]:
# LightGBM with TF-IDF Vectorizer

from lightgbm import LGBMClassifier

tfidf = TfidfVectorizer(ngram_range=(1,2), max_df=0.75, min_df=3)

X_full = train_df[['tokens'] + categorical_cols + numeric_cols]
X_test = test_df[['tokens'] + categorical_cols + numeric_cols]

label_encoder = LabelEncoder()
#y_train = label_encoder.fit_transform(train_df["label"])
y_full = label_encoder.fit_transform(train_df["label"])

X_train, X_val, y_train, y_val = train_test_split(
    X_full, y_full, test_size=0.2, random_state=42, stratify=y_full
)

cache_dir = './cache_dir'
memory = Memory(cache_dir, verbose=0)

lgb = LGBMClassifier(objective='multiclass', random_state=42, n_jobs=-1)

pipe = Pipeline([
    ('tfidf', tfidf),
    ('clf', lgb)
], memory=memory)

param_grid = {
    'clf__n_estimators': [100, 200],
    'clf__num_leaves': [31, 63],
    'clf__learning_rate': [0.05, 0.1, 0.2],
}

grid = GridSearchCV(pipe, param_grid, cv=3, verbose=2, n_jobs=-1, error_score='raise')
grid.fit(X_train_text, y_train)

print("Best score:", grid.best_score_)
print("Best params:", grid.best_params_)
best_model = grid.best_estimator_

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('text', text_pipeline, 'tokens'),
        ('cat', categorical_pipeline, categorical_cols),
        ('num', numeric_pipeline, numeric_cols)
    ],
    remainder='drop'  # drop any other columns not listed
)

model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', LogisticRegression(max_iter=5000, solver='liblinear'))
])

model_pipeline.fit(train_df, train_df['label'])

# Predict
preds = model_pipeline.predict(test_df)

In [112]:
X_full_text = train_df['tokens'].apply(lambda x: ' '.join(x))
text_clf.fit(X_full_text, y_full)

# Predict on test data
X_test_text = test_df['tokens'].apply(lambda x: ' '.join(x))
y_test_pred_encoded = text_clf.predict(X_test_text)
y_test_pred = label_encoder.inverse_transform(y_test_pred_encoded)

# Export to CSV
output_path = "predicted.csv"
pd.DataFrame({
    "Id": range(len(y_test_pred)),
    "Predicted": y_test_pred
}).to_csv(output_path, index=False)

print(f"\n✅ Predictions exported to {output_path}")


✅ Predictions exported to predicted.csv


In [None]:
X_full = train_df[['tokens'] + categorical_cols + numeric_cols]
X_test = test_df[['tokens'] + categorical_cols + numeric_cols]

label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_df["label"])

X_train, X_val, y_train_split, y_val_split = train_test_split(
    X_full, y_train, test_size=0.2, random_state=42, stratify=y_train
)

log_clf = LogisticRegression(solver='liblinear', max_iter=1000)
rf_clf = RandomForestClassifier(random_state=42)
xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

voting_clf = VotingClassifier(
    estimators=[
        ('lr', log_clf),
        ('rf', rf_clf),
        ('xgb', xgb_clf)
    ],
    voting='soft'
)

model_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', voting_clf)
])

# Define hyperparameter grid
param_grid = {
    'classifier__lr__C': [0.1, 1, 10],
    'classifier__rf__n_estimators': [50, 10, 200],
    'classifier__rf__max_depth': [5, 10],
    'classifier__xgb__max_depth': [3, 5, 7],
    'classifier__xgb__n_estimators': [50, 100, 200],
    'classifier__xgb__learning_rate': [0.01, 0.1, 0.3]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# GridSearchCV setup
grid_search = GridSearchCV(
    model_pipeline,
    param_grid=param_grid,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
    error_score='raise'
)

# # RandomizedSearchCV
# random_search = RandomizedSearchCV(
#     estimator=model_pipeline,
#     param_distributions=param_grid,  # same param grid but treated as distributions
#     n_iter=30,                       # number of random parameter settings to try (adjust as needed)
#     cv=cv,
#     scoring='accuracy',
#     n_jobs=-1,
#     verbose=1,
#     error_score='raise',
#     random_state=42                  # for reproducibility
# )

# Run grid search
grid_search.fit(X_train, y_train_split)

# Best model and score
print("Best Params:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

# Predict on validation set using the best estimator
y_val_pred = grid_search.best_estimator_.predict(X_val)

Fitting 5 folds for each of 486 candidates, totalling 2430 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best Params: {'classifier__lr__C': 10, 'classifier__rf__max_depth': 5, 'classifier__rf__n_estimators': 200, 'classifier__xgb__learning_rate': 0.3, 'classifier__xgb__max_depth': 3, 'classifier__xgb__n_estimators': 200}
Best Accuracy: 0.7943889681407514




In [73]:
print("Training model...")
model_pipeline.fit(X_train, y_train_split)

y_val_pred = model_pipeline.predict(X_val)
print("\nValidation Report:")
print(classification_report(y_val_split, y_val_pred, target_names=label_encoder.classes_))
print("Validation Accuracy:", accuracy_score(y_val_split, y_val_pred))

Training model...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Validation Report:
                        precision    recall  f1-score   support

        american (new)       0.58      0.40      0.48       280
american (traditional)       0.66      0.82      0.73       536
          asian fusion       0.26      0.07      0.11        72
        canadian (new)       0.40      0.35      0.37        97
               chinese       0.80      0.91      0.85       339
               italian       0.83      0.90      0.86       406
              japanese       0.84      0.82      0.83       213
         mediterranean       0.90      0.71      0.80       146
               mexican       0.96      0.94      0.95       443
                  thai       0.88      0.75      0.81        97

              accuracy                           0.77      2629
             macro avg       0.71      0.67      0.68      2629
          weighted avg       0.76      0.77      0.76      2629

Validation Accuracy: 0.7717763408139977




In [74]:
model_pipeline.fit(X_full, y_train)
y_test_pred_encoded = model_pipeline.predict(X_test)
y_test_pred = label_encoder.inverse_transform(y_test_pred_encoded)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [75]:
output_path = "predicted.csv"
pd.DataFrame({
    "Id": range(len(y_test_pred)),
    "Predicted": y_test_pred
}).to_csv(output_path, index=False)

print(f"\n✅ Predictions exported to {output_path}")


✅ Predictions exported to predicted.csv
