In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Make sure the NLTK resources used by this cell are available locally.
# 'punkt_tab' is requested in addition to 'punkt' because newer NLTK releases
# load the tokenizer tables for word_tokenize() from it; the later
# tokenization cell of this notebook downloads it as well, so requesting it
# here keeps both cells consistent and lets this cell run on a fresh machine.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Build the Indonesian stemmer provided by Sastrawi
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Text normalization helper
def normalize_text(text):
    """Keep only ASCII letters and whitespace, then lowercase the result.

    Digits, punctuation and any other character are deleted outright (they
    are not replaced by a space), so fragments on either side of a removed
    character end up joined together, e.g. "top-up" becomes "topup".

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, lower-cased string.
    """
    unwanted_chars = re.compile(r'[^a-zA-Z\s]')
    return unwanted_chars.sub('', text).lower()

# Tokenization, stop-word removal, and stemming
_STOP_WORDS = None  # populated on first use by _get_stop_words()


def _get_stop_words():
    """Return the Indonesian stop-word set, loading it from NLTK only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        # The NLTK 'stopwords' corpus must have been downloaded beforehand.
        _STOP_WORDS = frozenset(stopwords.words('indonesian'))
    return _STOP_WORDS


def preprocess_text(text):
    """Normalize, tokenize, remove stop words from, and stem a piece of text.

    The stop-word list is loaded once and reused across calls instead of
    being rebuilt for every input, which matters when this function is
    applied row by row to a large DataFrame column.

    Args:
        text: The raw string to preprocess.

    Returns:
        A single string made of the stemmed, non-stop-word tokens joined by
        one space. It is empty when no token survives the filtering.
    """
    # Normalization: letters and whitespace only, lower-cased
    text = normalize_text(text)

    # Tokenization
    tokens = word_tokenize(text)

    # Stop-word removal (cached set, see _get_stop_words)
    stop_words = _get_stop_words()
    filtered_tokens = [word for word in tokens if word not in stop_words]

    # Stemming with the module-level Sastrawi stemmer
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]

    return ' '.join(stemmed_tokens)

# Read the input CSV file
file_path = r'H:\My Drive\analisa sentimen git\percobaan 9\validation_filtered.csv'
data = pd.read_csv(file_path)

# Drop rows that are exact duplicates across all columns
data = data.drop_duplicates()

# Report how many rows remain after de-duplication
print(f"Jumlah total data setelah menghapus duplikat: {len(data)}")

# Apply the preprocessing to the 'content' column.
# Missing values are replaced by an empty string first: astype(str) alone
# would turn NaN into the literal text 'nan', which would then be
# preprocessed as if it were real review text.
data['processed_content'] = (
    data['content'].fillna('').astype(str).apply(preprocess_text)
)

# Count rows with a strictly positive / strictly negative score.
# NOTE(review): the output recorded for this cell reports every row as
# positive and none as negative, which suggests 'score' is not a signed
# value (possibly a star rating) -- confirm the intended thresholds.
jumlah_positif = int((data['score'] > 0).sum())
jumlah_negatif = int((data['score'] < 0).sum())

print(f"Jumlah positif: {jumlah_positif}")
print(f"Jumlah negatif: {jumlah_negatif}")

# Save the processed DataFrame to a new CSV file
output_file_path = r'H:\My Drive\analisa sentimen git\percobaan 9\validation_results_processed.csv'
data.to_csv(output_file_path, index=False)

print(f"Data yang sudah diproses disimpan di: {output_file_path}")

# Show a few examples of the preprocessing result
print(data[['content', 'processed_content']].head())


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hisya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hisya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Jumlah total data setelah menghapus duplikat: 98862
Jumlah positif: 98862
Jumlah negatif: 0
Data yang sudah diproses disimpan di: H:\My Drive\analisa sentimen git\percobaan 9\validation_results_processed.csv
                                             content  \
0                            akun gopay saya di blok   
1  lambat sekali sekarang ini bosssku apk gojek g...   
2  baru download gojek dan hape baru trus ditop u...   
3                                          coba dulu   
4  gimana ini kak pin saya salah terus padahal ud...   

                                   processed_content  
0                                    akun gopay blok  
1                   lambat bosssku apk gojek gk kaya  
2  download gojek hape trus top u gopay transaksi...  
3                                               coba  
4               gimana kak pin salah udah ubah salah  


In [3]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import nltk
# Make sure the tokenizer resources needed by word_tokenize() are available
nltk.download('punkt')
nltk.download('punkt_tab')

# Read the CSV produced by the preprocessing step
input_path = r"H:\My Drive\analisa sentimen git\percobaan 9\validation_results_processed.csv"
data = pd.read_csv(input_path)

# Tokenize the 'content' and 'processed_content' columns.
# read_csv loads empty fields as NaN (for example a 'processed_content' that
# became empty after stop-word removal); str(NaN) is the text 'nan', which
# would be tokenized into a bogus ['nan']. Missing values are therefore
# replaced by '' so that they tokenize to an empty list.
data['content_tokenized'] = (
    data['content'].fillna('').astype(str).apply(word_tokenize)
)
data['processed_content_tokenized'] = (
    data['processed_content'].fillna('').astype(str).apply(word_tokenize)
)

# Save the result to a new CSV file.
# Note: to_csv writes each token list as its string representation.
output_path = r"H:\My Drive\analisa sentimen git\percobaan 9\validation_results_tokenized.csv"
data.to_csv(output_path, index=False)

print(f"File berhasil disimpan di {output_path}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hisya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\hisya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hisya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


File berhasil disimpan di H:\My Drive\analisa sentimen git\percobaan 9\validation_results_tokenized.csv
