In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [7]:
# Load dataset
df = pd.read_excel('/content/drive/MyDrive/00. Drive PC/1.STIS/SEMNAS 2024/data_ml_filtered.xlsx')
df

Unnamed: 0,Cleaned Teks Berita,Label
0,indeks harga saham gabung ihsg pagi buka zona ...,ihsg naik
1,indeks harga saham gabung ihsg pagi buka zona ...,ihsg naik
2,indeks harga saham gabung ihsg tutup dagang le...,ihsg turun
3,indeks harga saham gabung ihsg awal pekan zona...,ihsg naik
4,indeks harga saham gabung ihsg zona hijau harg...,ihsg naik
...,...,...
2458,indeks harga saham gabung ihsg tutup zona nega...,ihsg turun
2459,indeks harga saham gabung ihsg tutup zona hija...,ihsg naik
2460,nilai tukar dolar amerika serikat as rupiah le...,ihsg naik
2461,indeks harga saham gabung ihsg gerak zona mera...,ihsg turun


In [8]:
df.isna().sum()

Cleaned Teks Berita    0
Label                  0
dtype: int64

In [9]:
# Features and Labels
X = df['Cleaned Teks Berita']
y = df['Label']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Naive Bayes

In [10]:
# Define pipeline
pipeline_nb = Pipeline([
    ('tfidf', TfidfVectorizer(max_df=0.9, ngram_range=(1, 1))),  # Use fixed parameters for TfidfVectorizer
    ('nb', MultinomialNB())
])

# Define hyperparameters
param_grid_nb = {
    'nb__alpha': [0.01, 0.1, 1.0, 10.0, 100.0]
}

# GridSearchCV
grid_nb = GridSearchCV(pipeline_nb, param_grid_nb, cv=5, n_jobs=-1, verbose=2)
grid_nb.fit(X_train, y_train)

# Best parameters and model evaluation
print("Best Parameters for Naive Bayes:", grid_nb.best_params_)
y_pred_nb = grid_nb.predict(X_test)
print(classification_report(y_test, y_pred_nb, digits=4))

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Parameters for Naive Bayes: {'nb__alpha': 0.1}
              precision    recall  f1-score   support

   ihsg naik     0.6667    0.8444    0.7451       270
  ihsg turun     0.7219    0.4888    0.5829       223

    accuracy                         0.6836       493
   macro avg     0.6943    0.6666    0.6640       493
weighted avg     0.6916    0.6836    0.6717       493



# Random Forest

In [11]:
# Define pipeline
pipeline_rf = Pipeline([
    ('tfidf', TfidfVectorizer(max_df=0.9, ngram_range=(1, 1))),  # Use fixed parameters for TfidfVectorizer
    ('rf', RandomForestClassifier())
])

# Define hyperparameters
param_grid_rf = {
    'rf__n_estimators': [100, 200, 300],
    'rf__max_depth': [None, 10, 20, 30]
}

# GridSearchCV
grid_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv=5, n_jobs=-1, verbose=2)
grid_rf.fit(X_train, y_train)

# Best parameters and model evaluation
print("Best Parameters for Random Forest:", grid_rf.best_params_)
y_pred_rf = grid_rf.predict(X_test)
print(classification_report(y_test, y_pred_rf,digits=4))


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Parameters for Random Forest: {'rf__max_depth': None, 'rf__n_estimators': 200}
              precision    recall  f1-score   support

   ihsg naik     0.7308    0.9148    0.8125       270
  ihsg turun     0.8516    0.5919    0.6984       223

    accuracy                         0.7688       493
   macro avg     0.7912    0.7534    0.7555       493
weighted avg     0.7854    0.7688    0.7609       493



# SVM

In [12]:
# Define pipeline
pipeline_svm = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svm', SVC())
])

# Define hyperparameters
param_grid_svm = {
    'svm__C': [0.1, 1, 10, 100],
    'svm__kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'svm__gamma': ['scale', 'auto']
}

# GridSearchCV
grid_svm = GridSearchCV(pipeline_svm, param_grid_svm, cv=5, n_jobs=-1, verbose=2)
grid_svm.fit(X_train, y_train)

# Best parameters and model evaluation
print("Best Parameters for SVM:", grid_svm.best_params_)
y_pred_svm = grid_svm.predict(X_test)
print(classification_report(y_test, y_pred_svm,digits=4))

Fitting 5 folds for each of 32 candidates, totalling 160 fits
Best Parameters for SVM: {'svm__C': 100, 'svm__gamma': 'scale', 'svm__kernel': 'rbf'}
              precision    recall  f1-score   support

   ihsg naik     0.8175    0.8630    0.8396       270
  ihsg turun     0.8221    0.7668    0.7935       223

    accuracy                         0.8195       493
   macro avg     0.8198    0.8149    0.8166       493
weighted avg     0.8196    0.8195    0.8188       493

