In [1]:
import functools
import string

import joblib
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV

# Read the labelled keyword examples from disk into a DataFrame.
data = pd.read_csv(filepath_or_buffer='label_data.csv')

# Preprocess data
@functools.lru_cache(maxsize=None)
def _stop_words(language='indonesian'):
    """Return the NLTK stop-word list for ``language`` as a frozenset.

    The result is cached, so the corpus is read once per language instead of
    once per processed row.
    """
    return frozenset(stopwords.words(language))


# Translation table that deletes every ASCII punctuation character; built once
# because it never changes between calls.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Normalise a raw keyword/comment into a space-joined string of tokens.

    Steps: lowercase, strip ASCII punctuation, tokenize with NLTK's
    ``word_tokenize``, then drop Indonesian stop words. No stemming is
    performed (``LancasterStemmer`` is imported by the notebook but was never
    applied here).

    Parameters
    ----------
    text : str or scalar
        The raw text. Missing values (``None``, ``NaN``, ``pd.NA``) yield an
        empty string; any other non-string scalar is converted with ``str()``.

    Returns
    -------
    str
        The remaining tokens joined by single spaces; may be empty.
    """
    if not isinstance(text, str):
        # Empty CSV cells reach .apply() as NaN, which has no .lower().
        if pd.isna(text):
            return ''
        text = str(text)

    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(_PUNCTUATION_TABLE)
    # Tokenize
    words = word_tokenize(text)
    # Remove stop words
    stop_words = _stop_words()
    return ' '.join(word for word in words if word not in stop_words)

# Clean every keyword with the same function classify_comment uses at
# prediction time, so training and inference see identically normalised text.
data['keyword'] = data['keyword'].apply(preprocess_text)

# Prepare data
X = data['keyword']
y = data['label']

# Preview the first rows only instead of dumping the whole frame into the
# notebook output.
display(data.head(10))

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the TF-IDF vocabulary and weights from the training split only, then
# project both splits with that fitted vectorizer.
vectorizer = TfidfVectorizer().fit(X_train)
X_train_tfidf = vectorizer.transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Candidate logistic-regression settings explored by the grid search.
param_grid = dict(
    C=[0.1, 0.5, 1, 5, 10],
    max_iter=[100, 200, 300],
    solver=['lbfgs', 'liblinear'],
)

# Score every combination by 5-fold cross-validated accuracy on the training features.
grid_search = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train_tfidf, y_train)

# Keep the estimator that the search selected.
best_model = grid_search.best_estimator_

# Save model to pretrained model
# The fitted vectorizer is saved alongside the classifier: classify_comment
# needs both, since the model is fed features produced by this vectorizer.
# NOTE: joblib files are pickles -- only load them back from trusted sources.
joblib.dump(best_model, 'RAILS1.0alpha_text_classification.pkl')
joblib.dump(vectorizer, 'Tfidf_vectorizer.pkl')

# Predict on test data
y_pred = best_model.predict(X_test_tfidf)

# Print evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
# Some labels have no predicted (or no true) samples in this small test split.
# zero_division=0 reports those precision/recall/F1 values as 0.0 explicitly
# instead of emitting an UndefinedMetricWarning for each affected metric.
print(classification_report(y_test, y_pred, zero_division=0))

def classify_comment(comment):
    """Predict the label for a single raw comment.

    The comment is cleaned with ``preprocess_text``, converted to TF-IDF
    features by the module-level ``vectorizer``, and scored by the
    module-level ``best_model``.

    Parameters
    ----------
    comment : str
        Raw comment text.

    Returns
    -------
    The first (and only) element of the model's prediction array.
    """
    cleaned = preprocess_text(comment)
    features = vectorizer.transform([cleaned])
    return best_model.predict(features)[0]

# Quick check: classify one sample comment and print the predicted label.
new_comment = "hello cantik"
label = classify_comment(comment=new_comment)
print(f"Label: {label}")

Unnamed: 0,label,keyword
0,cod,cod
1,cod,bayar tunai
2,cod,bayar
3,cod,cash on delivery
4,halo,hello
5,halo,hello kak
6,halo,hai
7,halo,halo
8,halo,selamat
9,halo,hi




Accuracy: 0.1
              precision    recall  f1-score   support

      desain       0.00      0.00      0.00         0
      diskon       0.00      0.00      0.00         4
        halo       1.00      1.00      1.00         1
    kualitas       0.00      0.00      0.00         2
       ready       0.00      0.00      0.00         3

    accuracy                           0.10        10
   macro avg       0.20      0.20      0.20        10
weighted avg       0.10      0.10      0.10        10

Label: halo


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
