In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [None]:
# Read the TripAdvisor review/rating parquet file from its hf:// dataset path.
df = pd.read_parquet("hf://datasets/jniimi/tripadvisor-review-rating/data/train-00000-of-00001.parquet")

# Build the cleaned frame in one chain, leaving the raw `df` untouched:
#   1. drop rows whose review text is missing,
#   2. drop rows whose review is empty or whitespace-only,
#   3. keep only the first occurrence of each distinct review text,
#   4. renumber the index from zero.
df_clean = (
    df.dropna(subset=['review'])
    .loc[lambda frame: frame['review'].str.strip() != ""]
    .drop_duplicates(subset=['review'])
    .reset_index(drop=True)
)
print("Number of reviews after cleaning:", len(df_clean))

# Model input is the review text; the label to predict is the overall rating.
reviews = df_clean['review']
target = df_clean['overall']

Number of reviews after cleaning: 201278


In [5]:
# Hold out 20% of the reviews for evaluation; the fixed random_state makes
# the shuffle (and therefore the split) repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    reviews,
    target,
    test_size=0.2,
    random_state=42,
)

# Learn the TF-IDF vocabulary and weights from the training reviews only,
# then apply that same fitted transformation to the held-out reviews so no
# information from the test set leaks into the features.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [None]:
# Fit a logistic-regression classifier on the TF-IDF training features.
# `fit` returns the estimator itself, so construction and training are chained;
# max_iter is raised to 1000 iterations for the solver.
clf = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Predicted rating for every held-out review.
y_pred = clf.predict(X_test_tfidf)

In [7]:
# Score the held-out predictions: overall accuracy plus the per-class
# precision / recall / F1 table.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)

print("Accuracy:", accuracy)
# The newline separator puts the heading and the table on separate lines.
print("Classification Report:", report, sep="\n")

Accuracy: 0.6791534181240063
Classification Report:
              precision    recall  f1-score   support

         1.0       0.70      0.64      0.67      1667
         2.0       0.47      0.34      0.39      2115
         3.0       0.58      0.55      0.57      5621
         4.0       0.61      0.62      0.62     13443
         5.0       0.77      0.81      0.79     17410

    accuracy                           0.68     40256
   macro avg       0.63      0.59      0.61     40256
weighted avg       0.67      0.68      0.68     40256

