# Sentimnet Analyst Classification
##### `Sentiment atau emosi yang diungkapkan dalam teks bersifat positif, negatif dan netral`

# Data Cleansing & Preprocessing

### Mengimpor Data TSV
##### Struktur Data TSV File TSV `terdiri dari beberapa baris` di mana setiap baris merepresentasikan satu `record atau entri data`

In [1]:
import pandas as pd
import re
import torch
import random
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset


# Fungsi untuk set seed

In [2]:
# Fungsi untuk set seed
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

set_seed(27)


# Fungsi Untuk Membersihkan Teks

In [3]:

def clean_text(text):
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'(.)\1+', r'\1\1', text)
    return text 



# Fungsi Untuk Normalisasi Teks

In [4]:

def normalize_text(text):
    replacements = {
        'tdk': 'tidak', 'dmn': 'dimana', 'cpt': 'cepat', 'ga': 'tidak', 'enggak': 'tidak',
        'ngga': 'tidak', 'gak': 'tidak', 'bgt': 'banget', 'bgd': 'banget', 'bnyk': 'banyak',
        'bgtu': 'begitu', 'dgn': 'dengan', 'hrs': 'harus', 'knp': 'kenapa', 'lgsg': 'langsung',
        'ngerti': 'mengerti', 'pake': 'pakai', 'sangat2': 'sangat', 'sukak': 'suka', 'syg': 'sayang',
        'ttg': 'tentang', 'utk': 'untuk', 'wkt': 'waktu', 'yaa': 'ya', 'bgitu': 'begitu', 'ak': 'aku',
        'kau': 'kamu', 'saya': 'aku', 'anda': 'kamu', 'kami': 'kita', 'kalian': 'kamu', 'dirimu': 'kamu',
        'dirinya': 'dia', 'diriku': 'aku'
    }
    for key, value in replacements.items():
        text = text.replace(key, value)
    return text


# Membaca dan Memproses Data

In [5]:
# Membaca dan memproses data
df_train = pd.read_csv('train_preprocess.tsv', sep='\t', header=None)
df_valid = pd.read_csv('valid_preprocess.tsv', sep='\t', header=None)
df_test = pd.read_csv('test_preprocess.tsv', sep='\t', header=None)

df_train.columns = ['review_text', 'category']
df_valid.columns = ['review_text', 'category']
df_test.columns = ['review_text', 'category']


In [6]:
df_train

Unnamed: 0,review_text,category
0,warung ini dimiliki oleh pengusaha pabrik tahu...,positive
1,mohon ulama lurus dan k212 mmbri hujjah partai...,neutral
2,lokasi strategis di jalan sumatera bandung . t...,positive
3,betapa bahagia nya diri ini saat unboxing pake...,positive
4,duh . jadi mahasiswa jangan sombong dong . kas...,negative
...,...,...
10995,tidak kecewa,positive
10996,enak rasa masakan nya apalagi kepiting yang me...,positive
10997,hormati partai-partai yang telah berkoalisi,neutral
10998,"pagi pagi di tol pasteur sudah macet parah , b...",negative


# Mengonversi label teks menjadi numberik 

In [7]:

label_mapping = {'positive': 1, 'negative': 0,'neutral': 2}
df_train['category'] = df_train['category'].map(label_mapping)
df_valid['category'] = df_valid['category'].map(label_mapping)
df_test['category'] = df_test['category'].map(label_mapping)

# Konversi kolom 'category' ke tipe data numberik dan pastikan tidak ada yang NaN

In [8]:
# Konversi kolom 'category' ke tipe data numerik dan pastikan tidak ada NaN
df_train = df_train.dropna(subset=['category'])
df_valid = df_valid.dropna(subset=['category'])
df_test = df_test.dropna(subset=['category'])

df_train['category'] = df_train['category'].astype(int)
df_valid['category'] = df_valid['category'].astype(int)
df_test['category'] = df_test['category'].astype(int)



In [9]:
df_train

Unnamed: 0,review_text,category
0,warung ini dimiliki oleh pengusaha pabrik tahu...,1
1,mohon ulama lurus dan k212 mmbri hujjah partai...,2
2,lokasi strategis di jalan sumatera bandung . t...,1
3,betapa bahagia nya diri ini saat unboxing pake...,1
4,duh . jadi mahasiswa jangan sombong dong . kas...,0
...,...,...
10995,tidak kecewa,1
10996,enak rasa masakan nya apalagi kepiting yang me...,1
10997,hormati partai-partai yang telah berkoalisi,2
10998,"pagi pagi di tol pasteur sudah macet parah , b...",0


# Mengambil sampel data menjadi 1000 sampel

In [10]:
df_train.count()

review_text    11000
category       11000
dtype: int64

In [11]:
df_test.count()

review_text    500
category       500
dtype: int64

In [12]:
df_valid.count()

review_text    1260
category       1260
dtype: int64

In [13]:
df_train_sampled = df_train.sample(n=500, random_state=27)
df_valid_sampled = df_valid.sample(n=500, random_state=27)
df_test_sampled = df_test.sample(n=500, random_state=27)



# Membersihkan dan Normalisasi Teks

In [14]:
# Membersihkan dan normalisasi teks
df_train_sampled['review_text'] = df_train_sampled['review_text'].apply(lambda x: clean_text(str(x).lower()))
df_valid_sampled['review_text'] = df_valid_sampled['review_text'].apply(lambda x: clean_text(str(x).lower()))
df_test_sampled['review_text'] = df_test_sampled['review_text'].apply(lambda x: clean_text(str(x).lower()))

df_train_sampled['review_text'] = df_train_sampled['review_text'].apply(normalize_text)
df_valid_sampled['review_text'] = df_valid_sampled['review_text'].apply(normalize_text)
df_test_sampled['review_text'] = df_test_sampled['review_text'].apply(normalize_text)



# Cetak Hasil Pembersihkan dan Normalisasi 

In [15]:

print("Hasil Pembersihan dan Normalisasi Data Train:")
print(df_train_sampled.head())

print("\nHasil Pembersihan dan Normalisasi Data Valid:")
print(df_valid_sampled.head())

print("\nHasil Pembersihan dan Normalisasi Data Test:")
print(df_test_sampled.head())

Hasil Pembersihan dan Normalisasi Data Train:
                                            review_text  category
5053                                        tidaku rapi         0
6436  makuan di saung di tentidakuh sawah makuanan n...         1
709   resto ini bekas tempat penggertidakujian kayu ...         1
7844  jika sedang di cihampelas dan cari oleholeh co...         1
5584  pada bulan oktober lalu aku dan teman aku maku...         1

Hasil Pembersihan dan Normalisasi Data Valid:
                                            review_text  category
556   menemukan tempat makuan ini secara tidaku sent...         1
1237  tempat nya nyaman dentidakun pemkamuntidakun k...         1
747                                  jokowi kerja nyata         1
329   dedi mulyadi dan ridwan kital diangtidakup kep...         2
869   nah di sini letaku permasalahan nya bang demok...         0

Hasil Pembersihan dan Normalisasi Data Test:
                                           review_text  category
37  

# Menginisialisasi Tokenizer BERT

In [16]:

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [17]:
tokenizer

BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

# Fungsi Untuk Tokenisasi dan Padding

In [18]:

def tokenize_and_pad(sentences, max_length=64):
    input_ids = []
    attention_masks = []

    for sent in sentences:
        encoded_dict = tokenizer.encode_plus(
            sent,                      # Kalimat yang akan ditokenisasi
            add_special_tokens=True,   # Tambahkan '[CLS]' dan '[SEP]'
            max_length=max_length,     # Padding & truncation length
            padding='max_length',      # Pad ke max_length
            return_attention_mask=True, # Return attention mask
            return_tensors='pt',       # Return pytorch tensors
            truncation=True            # Aktifkan truncation
        )
        
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    # Konversi lists ke tensors
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    
    return input_ids, attention_masks


# Tokenisasi dan Padding Untuk Dataset

In [19]:

train_inputs, train_masks = tokenize_and_pad(df_train_sampled['review_text'])
valid_inputs, valid_masks = tokenize_and_pad(df_valid_sampled['review_text'])
test_inputs, test_masks = tokenize_and_pad(df_test_sampled['review_text'])

# Ekstraksi Labels

In [20]:

train_labels = torch.tensor(df_train_sampled['category'].values).long()
valid_labels = torch.tensor(df_valid_sampled['category'].values).long()
test_labels = torch.tensor(df_test_sampled['category'].values).long()


# Membuat TensorDataset

In [21]:

# Membuat TensorDataset
train_data = TensorDataset(train_inputs, train_masks, train_labels)
valid_data = TensorDataset(valid_inputs, valid_masks, valid_labels)
test_data = TensorDataset(test_inputs, test_masks, test_labels)


# Membuat DataLoader

In [22]:
train_dataloader = DataLoader(train_data, batch_size=16, shuffle=True)
valid_dataloader = DataLoader(valid_data, batch_size=16, shuffle=False)
test_dataloader = DataLoader(test_data, batch_size=16, shuffle=False)

# Inisialisasi model BERT untuk klasifikasi 

In [23]:

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Menentukan Device

In [24]:

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

# Inisialisasi Optimizer

In [25]:

optimizer = AdamW(model.parameters(), lr=2e-5)



# Fungsi Evaluasi 

In [26]:

def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    total_accuracy = 0

    for batch in dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device).long()  # Ubah ke tipe data long

        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            logits = outputs.logits

        loss = criterion(logits, b_labels)
        total_loss += loss.item()

        preds = torch.argmax(logits, dim=1).flatten()
        accuracy = (preds == b_labels).cpu().numpy().mean()
        total_accuracy += accuracy

    avg_loss = total_loss / len(dataloader)
    avg_accuracy = total_accuracy / len(dataloader)
    
    return avg_loss, avg_accuracy


# Menentukan Loss Function 

In [27]:
# Menentukan loss function
criterion = torch.nn.CrossEntropyLoss()


# Pelatihan

In [28]:

model.train()
for epoch in range(4):  # Lakukan 4 epoch
    total_loss = 0
    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device).long()  # Ubah ke tipe data long

        optimizer.zero_grad()
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Average Training Loss: {avg_train_loss}")

    # Evaluasi
    eval_loss, eval_accuracy = evaluate(model, valid_dataloader, criterion, device)
    print(f"Validation Loss: {eval_loss}, Validation Accuracy: {eval_accuracy}")



Average Training Loss: 0.9113919660449028
Validation Loss: 0.7681390661746264, Validation Accuracy: 0.625
Average Training Loss: 0.719468230381608
Validation Loss: 0.7050008112564683, Validation Accuracy: 0.708984375
Average Training Loss: 0.5834710355848074
Validation Loss: 0.6857857443392277, Validation Accuracy: 0.693359375
Average Training Loss: 0.4515790930017829
Validation Loss: 0.6656732615083456, Validation Accuracy: 0.736328125


# Simpan Model Setelah Pelatihan 

In [None]:
# Simpan model setelah pelatihan
torch.save(model.state_dict(), 'bert_sentiment_model.pth')

In [None]:
pip install flask torch transformers


Note: you may need to restart the kernel to use updated packages.


In [32]:
pip install flask transformers torch





In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from flask import Flask, request, jsonify

# Inisialisasi Flask app
app = Flask(__name__)

# Inisialisasi tokenizer dan model
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Muat model yang telah dilatih
model.load_state_dict(torch.load('bert_sentiment_model.pth'))
model.eval()

# Menentukan device (CPU atau GPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Fungsi untuk tokenisasi dan padding
def tokenize_and_pad(sentences, max_length=64):
    encoded_dict = tokenizer.encode_plus(
        sentences,                      # Kalimat yang akan ditokenisasi
        add_special_tokens=True,        # Tambahkan '[CLS]' dan '[SEP]'
        max_length=max_length,          # Padding & truncation length
        padding='max_length',           # Pad ke max_length
        return_attention_mask=True,     # Return attention mask
        return_tensors='pt',            # Return pytorch tensors
        truncation=True                 # Aktifkan truncation
    )
    
    input_ids = encoded_dict['input_ids'].to(device)
    attention_mask = encoded_dict['attention_mask'].to(device)
    
    return input_ids, attention_mask

# Mapping kategori label ke teks
label_mapping = {0: 'negative', 1: 'positive', 2: 'neutral'}

# Endpoint untuk klasifikasi sentimen
@app.route('/predict', methods=['POST'])
def predict():
    # Mendapatkan input teks dari request
    data = request.json
    text = data.get('text')
    
    if not text:
        return jsonify({"error": "No text provided"}), 400
    
    # Tokenisasi dan padding input teks
    input_ids, attention_mask = tokenize_and_pad(text)

    # Lakukan prediksi
    with torch.no_grad():
        outputs = model(input_ids, token_type_ids=None, attention_mask=attention_mask)
        logits = outputs.logits

    # Mendapatkan prediksi label
    preds = torch.argmax(logits, dim=1).flatten()
    sentiment = label_mapping[preds.item()]

    # Mengembalikan hasil dalam format JSON
    return jsonify({"output": sentiment})

# Menjalankan Flask app
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
