In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [7]:
# Load the Excel workbook holding the text and label columns into a DataFrame.
# NOTE(review): the path looks like a Colab Google Drive mount — confirm it is
# available before running this cell.
data_path = '/content/drive/MyDrive/00. Drive PC/1.STIS/SEMNAS 2024/data_ml.xlsx'
df = pd.read_excel(data_path)


In [8]:
# Per-column count of missing values in the loaded frame
df.isna().sum(axis=0)

Cleaned Teks Berita    3
Label Fix              0
dtype: int64

In [9]:
# Keep only the rows that actually have news text.
# `DataFrame.dropna` returns a new frame instead of modifying `df`, so a bare
# call whose result is not assigned has no effect. The previous unassigned
# `df.dropna(axis=1)` / `df.dropna(how='all')` calls were therefore dead code
# and have been removed; the single assignment below is the only cleanup that
# ever took effect.
df = df.dropna(subset=['Cleaned Teks Berita'])

In [10]:
# Re-count missing values per column to confirm the text column is now complete
df.isna().sum(axis=0)

Cleaned Teks Berita    0
Label Fix              0
dtype: int64

In [11]:
# Input text (features) and target labels, taken straight from the cleaned frame
X, y = df['Cleaned Teks Berita'], df['Label Fix']

# Hold out 20% of the rows for final evaluation; the fixed seed keeps the
# partition identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Naive Bayes

In [16]:
# Naive Bayes: TF-IDF features feeding a multinomial NB classifier.
# The vectorizer settings are fixed; only the classifier is tuned.
pipeline_nb = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(ngram_range=(1, 1), max_df=0.9)),
    ('nb', MultinomialNB()),
])

# Candidate values for the NB smoothing parameter `alpha`
param_grid_nb = dict(nb__alpha=[0.01, 0.1, 1.0, 10.0, 100.0])

# 5-fold cross-validated search over the grid, using all available cores
grid_nb = GridSearchCV(
    estimator=pipeline_nb,
    param_grid=param_grid_nb,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_nb.fit(X_train, y_train)

# Report the winning setting, then score the refit model on the held-out split
print("Best Parameters for Naive Bayes:", grid_nb.best_params_)
y_pred_nb = grid_nb.predict(X_test)
print(classification_report(y_test, y_pred_nb, digits=4))

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Parameters for Naive Bayes: {'nb__alpha': 0.1}
              precision    recall  f1-score   support

   ihsg naik     0.6991    0.8777    0.7783       368
  ihsg turun     0.7152    0.4484    0.5512       252

    accuracy                         0.7032       620
   macro avg     0.7072    0.6631    0.6648       620
weighted avg     0.7057    0.7032    0.6860       620



# Random Forest

In [18]:
# Define pipeline
pipeline_rf = Pipeline([
    ('tfidf', TfidfVectorizer(max_df=0.9, ngram_range=(1, 1))),  # Use fixed parameters for TfidfVectorizer
    ('rf', RandomForestClassifier())
])

# Define hyperparameters
param_grid_rf = {
    'rf__n_estimators': [100, 200, 300],
    'rf__max_depth': [None, 10, 20, 30]
}

# GridSearchCV
grid_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv=5, n_jobs=-1, verbose=2)
grid_rf.fit(X_train, y_train)

# Best parameters and model evaluation
print("Best Parameters for Random Forest:", grid_rf.best_params_)
y_pred_rf = grid_rf.predict(X_test)
print(classification_report(y_test, y_pred_rf,digits=4))


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Parameters for Random Forest: {'rf__max_depth': 30, 'rf__n_estimators': 200}
              precision    recall  f1-score   support

   ihsg naik     0.7603    0.9049    0.8263       368
  ihsg turun     0.8077    0.5833    0.6774       252

    accuracy                         0.7742       620
   macro avg     0.7840    0.7441    0.7519       620
weighted avg     0.7795    0.7742    0.7658       620



# SVM

In [19]:
# SVM: TF-IDF features (default vectorizer settings) feeding a support-vector
# classifier.
pipeline_svm = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svm', SVC())
])

# Hyperparameters to search.
# `gamma` is only used by the 'rbf', 'poly' and 'sigmoid' kernels; the linear
# kernel ignores it. Crossing `gamma` with 'linear' therefore fitted every
# linear candidate twice with identical results. The grid is split in two so
# that each distinct model is evaluated exactly once (28 candidates instead
# of 32).
svm_c_values = [0.1, 1, 10, 100]
param_grid_svm = [
    {
        'svm__kernel': ['linear'],
        'svm__C': svm_c_values,
    },
    {
        'svm__kernel': ['rbf', 'poly', 'sigmoid'],
        'svm__C': svm_c_values,
        'svm__gamma': ['scale', 'auto'],
    },
]

# 5-fold cross-validated grid search, using all available cores
grid_svm = GridSearchCV(pipeline_svm, param_grid_svm, cv=5, n_jobs=-1, verbose=2)
grid_svm.fit(X_train, y_train)

# Report the winning setting, then score the refit model on the held-out split.
# If the linear kernel wins, `best_params_` will not contain 'svm__gamma'.
print("Best Parameters for SVM:", grid_svm.best_params_)
y_pred_svm = grid_svm.predict(X_test)
print(classification_report(y_test, y_pred_svm, digits=4))

Fitting 5 folds for each of 32 candidates, totalling 160 fits
Best Parameters for SVM: {'svm__C': 100, 'svm__gamma': 'scale', 'svm__kernel': 'rbf'}
              precision    recall  f1-score   support

   ihsg naik     0.8464    0.8533    0.8498       368
  ihsg turun     0.7831    0.7738    0.7784       252

    accuracy                         0.8210       620
   macro avg     0.8147    0.8135    0.8141       620
weighted avg     0.8207    0.8210    0.8208       620

