In [None]:
Normalisasi dan Penghapusan Stopwords

# Import library yang dibutuhkan
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from google.colab import files

# Fetch the NLTK 'stopwords' resource that stopwords.words('english') reads in remove_stop_words
nltk.download('stopwords')

    # Text-normalisation helper
def normalize_text(text):
    """Return *text* lower-cased and reduced to the letters a-z and single spaces.

    Digits, punctuation and every other character that is neither a-z nor
    whitespace are removed; runs of whitespace (spaces, tabs, newlines) are
    collapsed to one space; leading and trailing whitespace is stripped.

    Args:
        text: The raw text. ``None`` yields an empty string and any other
            non-string value is converted with ``str()`` first.

    Returns:
        The normalised string (possibly empty).
    """
    if text is None:
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    # A single pass is enough: the negated class already covers digits, so a
    # separate digit-removal step is not needed.
    text = re.sub(r'[^a-z\s]', '', text)
    # Removing characters can leave gaps of several whitespace characters.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

    # Fungsi untuk menghapus stop words
def remove_stop_words(text):
    """Return *text* with NLTK's English stop words removed.

    The text is split on whitespace, every token that appears in
    ``stopwords.words('english')`` is dropped, and the remaining tokens are
    joined back together with single spaces. Matching is exact, so the input
    is expected to be lower-cased already.

    Args:
        text: A whitespace-separated string.

    Returns:
        The filtered string (empty when every token was a stop word).
    """
    # Cache the stop-word set on the function object so the corpus is read
    # once, not once per row when this is used with Series.apply.
    stop_words = getattr(remove_stop_words, '_stop_words', None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        remove_stop_words._stop_words = stop_words

    # NOTE(review): normalize_text strips apostrophes, so contractions arrive
    # here as e.g. "doesnt" and will not match an entry spelled "doesn't" --
    # confirm whether those tokens should be kept.
    return ' '.join(word for word in text.split() if word not in stop_words)

# Upload the input CSV through the Colab file picker
uploaded = files.upload()

# The upload result is keyed by file name; an empty result means nothing was
# uploaded, which would otherwise surface as an opaque IndexError below.
if not uploaded:
    raise ValueError("Tidak ada file yang diunggah.")

# Read the first uploaded file as CSV
file_name = next(iter(uploaded))
data = pd.read_csv(file_name)

# Fail early with a clear message when the expected column is missing
if 'reviewText' not in data.columns:
    raise ValueError("Kolom 'reviewText' tidak ditemukan dalam dataset.")

# Drop rows whose reviewText is missing
data = data.dropna(subset=['reviewText'])

# Normalise the text, then strip stop words from the normalised text.
# astype(str) guards against non-string cells reaching the text helpers.
data['reviewText_normalized'] = data['reviewText'].astype(str).apply(normalize_text)
data['reviewText_cleaned'] = data['reviewText_normalized'].apply(remove_stop_words)

# Write the result to a new CSV file
output_file = 'Processed_Reviews.csv'
data.to_csv(output_file, index=False)

# Offer the result file for download
files.download(output_file)

print("Proses selesai! File hasil telah disimpan dan dapat diunduh.")

Tokenisasi

# Import pustaka yang diperlukan
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from google.colab import files

# Tokenizer resources must be available before word_tokenize is called
nltk.download('punkt_tab')
nltk.download('punkt')

# Upload the dataset through the Colab file picker
datafile = files.upload()

# The first uploaded file name is used as the CSV path
dataset_path = list(datafile)[0]

# Load the dataset
data = pd.read_csv(dataset_path)

# Show which columns were read
print("Kolom yang tersedia:", data.columns)

# Tokenise only when the cleaned-review column is present
if 'reviewText_cleaned' in data.columns:
    # Column holding the cleaned review text
    reviews = data['reviewText_cleaned']

    # Missing reviews are skipped; assignment aligns on the index, so those
    # rows end up with NaN in the new column.
    data['tokenized_review'] = reviews.dropna().apply(lambda x: word_tokenize(str(x)))

    # Save the tokenised dataset to a new CSV file
    output_path = 'tokenized_reviews.csv'
    data.to_csv(output_path, index=False)

    print(f"Tokenisasi selesai. Hasil disimpan di: {output_path}")
else:
    # Name the column that was actually looked up (the message previously
    # contained an empty pair of quotes).
    print("Kolom 'reviewText_cleaned' tidak ditemukan dalam dataset.")

Bag of words dan TF-IDF

# Import libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Hand-written sample of already-cleaned reviews, held in a one-column frame
data = pd.Series(
    [
        "handcream beautiful fragrance doesnt stay protect hands washing size quite small",
        "wonderful hand lotion seriously dry skin stays long time little goes long long way go easy wonderful scent maybe bit strong first dissipates",
        "best hand cream around silky thick soaks way leaving hands super soft",
        "thanks",
        "great hand lotion soaks right leaves skin super soft greasy residue great scent",
        "great product doesnt leave hands feeling greasy slippery",
    ],
    name='reviewText_cleaned',
).to_frame()

# Preview the first rows under a heading
print("\nDataset Loaded:", data.head(), sep="\n")

# Fit a CountVectorizer on the reviews and build the term-count matrix
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data['reviewText_cleaned'].fillna(''))

# Dense table: one row per review, one column per vocabulary term
bow_df = pd.DataFrame(data=X.toarray(), columns=vectorizer.get_feature_names_out())

# Preview the first rows, limited to the first 10 term columns
print("\nBag of Words (first 10 features):", bow_df.head().iloc[:, :10], sep="\n")

# Report the (rows, columns) size of the full table
print("\nShape of Bag of Words:", bow_df.shape)

# Fit a TfidfVectorizer on the reviews and build the TF-IDF matrix
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(data['reviewText_cleaned'].fillna(''))

# Dense table: one row per review, one column per vocabulary term
tfidf_df = pd.DataFrame(data=X_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Preview the first rows, limited to the first 10 term columns
print("\nTF-IDF (first 10 features):", tfidf_df.head().iloc[:, :10], sep="\n")

# Report the (rows, columns) size of the full table
print("\nShape of TF-IDF:", tfidf_df.shape)

Metrik Evaluasi

 # Import library yang dibutuhkan
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from google.colab import files

# Step 1: upload the CSV through the Colab file picker
uploaded = files.upload()

# The first uploaded file name is used as the CSV path
dataset_path = list(uploaded)[0]
data = pd.read_csv(dataset_path)

# Step 2: show which columns were read
print("Kolom yang tersedia dalam dataset:", data.columns)

# Stop here when the cleaned-review column is absent
if 'reviewText_cleaned' not in data.columns:
    raise ValueError("Kolom 'reviewText_cleaned' tidak ditemukan dalam dataset.")

# Step 3: replace missing reviews with empty strings
data['reviewText_cleaned'] = data['reviewText_cleaned'].fillna('')

# Step 4: dummy label derived from review length --
# 1 when the review has more than 10 whitespace-separated tokens, otherwise 0
data['label'] = data['reviewText_cleaned'].apply(lambda review: int(len(review.split()) > 10))

# Show how many rows fall into each label
print("Distribusi label:\n", data['label'].value_counts())

# Step 5: turn the cleaned reviews into TF-IDF features
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data['reviewText_cleaned'])

# Step 6: hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_matrix, data['label'], test_size=0.2, random_state=42
)

# A classifier cannot be fitted on a single class; report that explicitly
# before fit() is reached.
if y_train.nunique() < 2:
    raise ValueError("Data latih hanya memiliki satu kelas label; model tidak dapat dilatih.")

# Step 7: train a logistic-regression classifier
model = LogisticRegression()
model.fit(X_train, y_train)

# Step 8: predict labels for the held-out rows
y_pred = model.predict(X_test)

# Step 9: compute the evaluation metrics.
# zero_division=0 states explicitly what to report for a class that is never
# predicted, so no UndefinedMetricWarning is raised in that case.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

# Print the evaluation results
print("Akurasi:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)
print("\nLaporan Klasifikasi:\n", classification_report(y_test, y_pred, zero_division=0))

Analisis Sentimen

# Import library yang dibutuhkan
import pandas as pd
from textblob import TextBlob
from google.colab import files

# 1. Upload the CSV through the Colab file picker and load the first uploaded file
uploaded = files.upload()
file_path = list(uploaded)[0]
data = pd.read_csv(file_path)

# 2. Stop here when the cleaned-review column is absent
if 'reviewText_cleaned' not in data.columns:
    raise ValueError("Kolom 'reviewText_cleaned' tidak ditemukan dalam file.")

# 3. Sentiment classifier
def analisis_sentimen(teks):
    """Label *teks* as 'positif', 'negatif' or 'netral' from its TextBlob polarity.

    A polarity above zero maps to 'positif', below zero to 'negatif', and
    anything else to 'netral'.
    """
    polarity = TextBlob(teks).sentiment.polarity
    if polarity > 0:
        return 'positif'
    if polarity < 0:
        return 'negatif'
    return 'netral'

# 4. Classify every cleaned review (values are converted to str first)
data['sentimen'] = data['reviewText_cleaned'].astype(str).map(analisis_sentimen)

# 5. Count how many reviews fall into each sentiment label
sentimen_summary = data['sentimen'].value_counts()
print("\nRingkasan Sentimen:", sentimen_summary, sep="\n")

# 6. Save the labelled dataset to a new CSV file
output_path = 'Processed_Reviews_with_Sentiment.csv'
data.to_csv(output_path, index=False)
print(f"\nHasil analisis sentimen disimpan ke: {output_path}")

# 7. Offer the result file for download
files.download(output_path)