In [58]:
#imports
import pickle
import pandas as pd
import itertools
from collections import Counter
import numpy as np
from gensim.models import Word2Vec


from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score


import nltk
from nltk.corpus import stopwords, wordnet
nltk.download('punkt') 
from nltk.tokenize import sent_tokenize, word_tokenize
from gensim.models import word2vec

import string


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


  
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


[nltk_data] Error loading punkt: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>
/kaggle/input/ucsd-dsc-258r-nlp/train.csv
/kaggle/input/ucsd-dsc-258r-nlp/test.csv
/kaggle/input/ucsd-dsc-258r-nlp/baseline.ipynb


In [59]:
# A function used to build a vocabulary based on descending word frequencies 
def build_vocab(sentences):
    # Build vocabulary
    word_counts = Counter(itertools.chain(*sentences))
    # Mapping from index to word
    vocabulary_inv = [x[0] for x in word_counts.most_common()]
    # Mapping from word to index
    vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
    return word_counts, vocabulary, vocabulary_inv

In [60]:
def get_embeddings(inp_data, vocabulary_inv, size_features=100, mode='skipgram', min_word_count=2, context=5):
    model_name = "embedding"
    num_workers = 15
    downsampling = 1e-3
    print('Training Word2Vec model...')
    sentences = [[vocabulary_inv[w] for w in s if w != -1] for s in inp_data]
    sg = 1 if mode == 'skipgram' else 0
    embedding_model = Word2Vec(sentences, workers=num_workers, sg=sg, vector_size=size_features, min_count=min_word_count, window=context, sample=downsampling)
    print("Saving Word2Vec model {}".format(model_name))
    embedding_weights = np.zeros((len(vocabulary_inv), size_features))
    for i in range(len(vocabulary_inv)):
        word = vocabulary_inv[i]
        if word in embedding_model.wv:
            embedding_weights[i] = embedding_model.wv[word]
        else:
            embedding_weights[i] = np.random.uniform(-0.25, 0.25, embedding_model.vector_size)
    return embedding_model

In [61]:
def preprocess_df(df):
    # get English stopwords
    stop_words = set(stopwords.words('english'))
    stop_words.add('would')
    # prepare translation table to translate punctuation to space
    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    preprocessed_sentences = []
    for i, row in df.iterrows():
        sent = row["text"]
        sent_nopuncts = sent.translate(translator)
        words_list = sent_nopuncts.strip().split()
        filtered_words = [word for word in words_list if word not in stop_words and len(word) != 1] # also skip space from above translation
        preprocessed_sentences.append(" ".join(filtered_words))
    df["text"] = preprocessed_sentences
    return df

In [62]:
df_train = pd.read_csv("/kaggle/input/ucsd-dsc-258r-nlp/train.csv")
df_test = pd.read_csv("/kaggle/input/ucsd-dsc-258r-nlp/test.csv")

df_train["text"] = df_train["review"]
df_test["text"] = df_test["review"]

df_train = preprocess_df(df_train)
df_test = preprocess_df(df_test)



In [63]:
#tokenize
df_train['tokens'] = df_train['text'].apply(word_tokenize)
df_test['tokens'] = df_test['text'].apply(word_tokenize)

#build vocabulary
sentences_train = df_train['tokens'].tolist()
sentences_test = df_test['tokens'].tolist()
word_counts, vocabulary, vocabulary_inv = build_vocab(sentences_train + sentences_test)



In [64]:
# prepare data
train_inp_data = [[vocabulary.get(word, -1) for word in tokens] for tokens in sentences_train]
test_inp_data = [[vocabulary.get(word, -1) for word in tokens] for tokens in sentences_test]

#embeddings
embedding_model = get_embeddings(train_inp_data + test_inp_data, vocabulary_inv)


Training Word2Vec model...
Saving Word2Vec model embedding


In [65]:
def average_embeddings(embedding_model, inp_data, vocabulary_inv):
    avg_embeddings = []
    for tokens in inp_data:
        vectors = [embedding_model.wv[vocabulary_inv[token]] for token in tokens if vocabulary_inv[token] in embedding_model.wv]
        if vectors:
            avg_embedding = np.mean(vectors, axis=0)
        else:
            avg_embedding = np.zeros(embedding_model.vector_size)
        avg_embeddings.append(avg_embedding)
    return np.array(avg_embeddings)


# Average embeddings
train_embeddings_avg = average_embeddings(embedding_model, train_inp_data, vocabulary_inv)
test_embeddings_avg = average_embeddings(embedding_model, test_inp_data, vocabulary_inv)

#convert to sparse matrix
embedding_train_dense = train_embeddings_avg
embedding_test_dense = test_embeddings_avg

In [66]:
#Sentiment Analysis
from nltk.sentiment import SentimentIntensityAnalyzer
from scipy.sparse import hstack, csr_matrix

sid = SentimentIntensityAnalyzer()
def get_sentiment(text):
    return sid.polarity_scores(text)['compound']

# Get polarity of text
df_train['sentiment'] = df_train['text'].apply(get_sentiment)
df_test['sentiment'] = df_test['text'].apply(get_sentiment)

#convert to numpy array and reshape
train_sentiment = np.array(df_train['sentiment'], dtype=np.float64).reshape(-1, 1)
test_sentiment = np.array(df_test['sentiment'], dtype=np.float64).reshape(-1, 1)

train_sentiment_dense = train_sentiment.reshape(-1, 1)
test_sentiment_dense = test_sentiment.reshape(-1, 1)


In [67]:
#TF-IDF Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer


tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))
tfidf_train = tfidf_vectorizer.fit_transform(df_train['text'])
tfidf_test = tfidf_vectorizer.transform(df_test['text'])



In [68]:
#combine
features_train = np.hstack([tfidf_train.toarray(), train_sentiment_dense, embedding_train_dense])
features_test = np.hstack([tfidf_test.toarray(), test_sentiment_dense, embedding_test_dense])

print("Shape of combined features_train:", features_train.shape)
print("Shape of combined features_test:", features_test.shape)


Shape of combined features_train: (13144, 5101)
Shape of combined features_test: (10000, 5101)


In [69]:
#test train val split
X_train, X_val, y_train, y_val = train_test_split(features_train, df_train['label'], test_size=0.2, random_state=42)


In [70]:
from scipy.sparse import hstack

# Train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predictions
val_predictions = model.predict(X_val)

# Evaluate the model
accuracy = accuracy_score(y_val, val_predictions)
f1 = f1_score(y_val, val_predictions, average='weighted')

print("Validation Accuracy:", accuracy)
print("Validation F1 Score:", f1)


Validation Accuracy: 0.798022061620388
Validation F1 Score: 0.7833301913121313


In [71]:
# Final predictions
test_predictions = model.predict(features_test)

In [72]:
#submission file
submission_df = pd.DataFrame({
    'Id': df_test['id'],  # Assuming 'id' is the column name for IDs in the test dataset
    'Predicted': test_predictions
})
submission_df.to_csv("predicted.csv", index=False)


In [73]:
#See predictions
print("Accuracy on Validation Set:", accuracy_score(y_val, val_predictions))
print("Classification Report:\n", classification_report(y_val, val_predictions))
print("Confusion Matrix:\n", confusion_matrix(y_val, val_predictions))



Accuracy on Validation Set: 0.798022061620388
Classification Report:
                         precision    recall  f1-score   support

        american (new)       0.58      0.40      0.47       281
american (traditional)       0.63      0.85      0.72       507
          asian fusion       0.70      0.17      0.28        81
        canadian (new)       0.43      0.20      0.27        99
               chinese       0.88      0.94      0.91       327
               italian       0.87      0.91      0.89       456
              japanese       0.86      0.90      0.88       197
         mediterranean       0.94      0.81      0.87       144
               mexican       0.97      0.95      0.96       437
                  thai       0.91      0.91      0.91       100

              accuracy                           0.80      2629
             macro avg       0.78      0.70      0.72      2629
          weighted avg       0.79      0.80      0.78      2629

Confusion Matrix:
 [[111 139   