In [None]:
# Cài thư viện cần thiết
!pip install -q kaggle pandas numpy scikit-learn nltk spacy transformers datasets imbalanced-learn joblib

In [None]:
import kagglehub

# Fetch the latest version of the dataset and keep the directory it was
# stored in so the location can be inspected.
path = kagglehub.dataset_download("snap/amazon-fine-food-reviews")
print("Path to dataset files:", path)

Using Colab cache for faster access to the 'amazon-fine-food-reviews' dataset.
Path to dataset files: /kaggle/input/amazon-fine-food-reviews


In [None]:
# Cell A1: check that the dataset files are available in Colab
import os

path = '/kaggle/input/amazon-fine-food-reviews'

# First report whether the directory exists, then list its contents.
dataset_present = os.path.exists(path)
print(dataset_present)
print("Các file trong thư mục:")
dataset_files = os.listdir(path)
print(dataset_files)

True
Các file trong thư mục:
['hashes.txt', 'Reviews.csv', 'database.sqlite']


**Load và EDA nhanh**

In [None]:
import pandas as pd

# Load the raw reviews (adjust the path if the dataset lives elsewhere).
df = pd.read_csv('/kaggle/input/amazon-fine-food-reviews/Reviews.csv')

# A notebook cell only renders its LAST expression, so the intermediate
# inspections are passed to display() (a builtin inside IPython kernels)
# instead of being left as bare expressions whose values are discarded.
display(df.shape, df.columns.tolist())  # size and column names
display(df.head())                      # first few rows
display(df['Score'].value_counts())     # distribution of star ratings

# The dataset stores the review time as a Unix timestamp in seconds.
df['Time'] = pd.to_datetime(df['Time'], unit='s')

# Time range covered by the reviews (rendered as the cell output).
df['Time'].min(), df['Time'].max()

(Timestamp('1999-10-08 00:00:00'), Timestamp('2012-10-26 00:00:00'))

**Kiểm tra duplicate & helpfulness**

In [None]:
# Duplicates: rows repeating the same review text for the same product.
dups = df.duplicated(subset=['Text','ProductId'])
# This cell ends with an assignment, so nothing would be rendered
# automatically; display() makes the diagnostics visible.
display(df[dups].shape)

# Helpfulness sanity check: the numerator must never exceed the denominator.
invalid_help = df[df['HelpfulnessNumerator'] > df['HelpfulnessDenominator']]
display(invalid_help.shape, invalid_help.head())

# Keep only the records whose helpfulness counts are consistent.
df = df[df['HelpfulnessNumerator'] <= df['HelpfulnessDenominator']]

**Loại duplicate, drop missing text**

In [None]:
# Remove duplicated reviews (the first occurrence is kept).
df = df.drop_duplicates(subset=['Text', 'ProductId', 'UserId'])

# Discard rows whose text is missing or contains only whitespace, then
# renumber the index so it is contiguous again.
has_text = df['Text'].notna() & (df['Text'].str.strip() != '')
df = df[has_text].reset_index(drop=True)
df.shape

(567143, 10)

**Gắn nhãn sentiment (2- or 3-class)**

In [None]:
# Option 1: 2-class (positive: 4-5, negative: 1-2); score 3 is dropped.
df2 = df.loc[df['Score'] != 3].copy()
# Compare the whole column at once, then translate the boolean to a label.
is_positive = df2['Score'] >= 4
df2['sentiment'] = is_positive.map({True: 'positive', False: 'negative'})

# Option 2: 3-class (1-2 negative, 3 neutral, 4-5 positive)
# df['sentiment'] = df['Score'].apply(lambda x: 'positive' if x>=4 else ('neutral' if x==3 else 'negative'))

**Text preprocessing (cơ bản)**

In [None]:
# Basic text cleaning
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords_set = set(stopwords.words('english'))

# Negation words decide sentiment polarity ("not good" vs. "good"), so they
# are kept even when the stopword list contains them.
_NEGATION_WORDS = frozenset({'no', 'nor', 'not'})


def clean_text(text):
    """Normalise a raw review into a space-separated string of tokens.

    Steps, in order: cast to ``str``; replace HTML tags and URLs with a
    space; rewrite the contraction ``n't`` as ``n not`` so the negation
    survives punctuation removal; replace every character that is not an
    ASCII letter, digit or whitespace with a space; lower-case; split on
    whitespace; drop single-character tokens and stopwords, except the
    negation words, which are always kept.

    Args:
        text: The raw review. Any object is accepted and converted with
            ``str()``.

    Returns:
        The cleaned tokens joined by single spaces (possibly an empty string).

    Relies on the module-level ``stopwords_set`` for the stopword lookup.
    """
    text = str(text)
    text = re.sub(r'<.*?>', ' ', text)            # remove HTML
    text = re.sub(r'http\S+', ' ', text)          # remove URLs
    # "don't" -> "don not", "can't" -> "can not": the stem left behind is
    # filtered as a stopword where the list contains it, while "not" is kept.
    # Without this step the apostrophe removal below would turn the
    # contraction into "don" + "t" and the negation would be lost.
    text = re.sub(r"n['’]t\b", 'n not', text, flags=re.IGNORECASE)
    text = re.sub(r'[^A-Za-z0-9\s]', ' ', text)   # remove punctuation
    text = text.lower()
    tokens = [
        t for t in text.split()
        if len(t) > 1 and (t in _NEGATION_WORDS or t not in stopwords_set)
    ]
    return ' '.join(tokens)

# Clean every review and show raw vs. cleaned text side by side.
df2['clean_text'] = df2['Text'].map(clean_text)
df2.loc[:, ['Text', 'clean_text']].head()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,Text,clean_text
0,I have bought several of the Vitality canned d...,bought several vitality canned dog food produc...
1,Product arrived labeled as Jumbo Salted Peanut...,product arrived labeled jumbo salted peanuts p...
2,This is a confection that has been around a fe...,confection around centuries light pillowy citr...
3,If you are looking for the secret ingredient i...,looking secret ingredient robitussin believe f...
4,Great taffy at a great price. There was a wid...,great taffy great price wide assortment yummy ...


**Train-test split (stratify)**

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split, stratified on the label so both parts keep the same class
# ratio; the fixed seed makes the split reproducible.
train_df, test_df = train_test_split(
    df2,
    test_size=0.2,
    random_state=42,
    stratify=df2['sentiment'],
)
len(train_df), len(test_df)

(419668, 104917)

**Baseline: TF-IDF + Logistic Regression**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix

# Baseline: unigram + bigram TF-IDF features fed to a logistic regression.
# class_weight='balanced' compensates for the class imbalance.
pipeline = Pipeline(
    steps=[
        (
            'tfidf',
            TfidfVectorizer(ngram_range=(1, 2), max_features=50000),
        ),
        (
            'clf',
            LogisticRegression(class_weight='balanced', max_iter=1000),
        ),
    ]
)

# Fit on the training split, then score the held-out split.
pipeline.fit(train_df['clean_text'], train_df['sentiment'])
pred = pipeline.predict(test_df['clean_text'])
report = classification_report(test_df['sentiment'], pred)
print(report)

              precision    recall  f1-score   support

    negative       0.71      0.92      0.80     16344
    positive       0.98      0.93      0.96     88573

    accuracy                           0.93    104917
   macro avg       0.85      0.93      0.88    104917
weighted avg       0.94      0.93      0.93    104917



**Điều chỉnh imbalance**

In [None]:
# Example: oversampling with imblearn (applies to vectorized features only)
# NOTE(review): this cell only imports the building blocks; no resampling or
# cross-validation is actually run in the visible notebook — TODO complete or remove.
from imblearn.over_sampling import RandomOverSampler
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

**Lưu Model**

In [None]:
import joblib

# Persist the fitted pipeline (vectorizer + classifier) as a single artifact.
joblib.dump(pipeline, '/content/sentiment_pipeline_lr.joblib')

# Build the predictions file on a copy so the split from the earlier cell is
# not modified in place.
test_df = test_df.copy()
test_texts = test_df['clean_text']
# Column of predict_proba that corresponds to the 'positive' class.
positive_idx = pipeline.classes_.tolist().index('positive')
test_df['predicted'] = pipeline.predict(test_texts)
test_df['pred_proba_pos'] = pipeline.predict_proba(test_texts)[:, positive_idx]

out_cols = [
    'Id', 'ProductId', 'UserId', 'Score', 'Time', 'Summary',
    'Text', 'clean_text', 'sentiment', 'predicted', 'pred_proba_pos',
]
test_df[out_cols].to_csv('/content/sentiment_predictions_sample.csv', index=False)
print("Saved /content/sentiment_predictions_sample.csv")

Saved /content/sentiment_predictions_sample.csv


In [None]:
from google.colab import files

# Hand the predictions CSV to the Colab file-download helper.
predictions_csv = '/content/sentiment_predictions_sample.csv'
files.download(predictions_csv)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>