In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from scipy import sparse
import pickle

In [None]:
# Load the preprocessed reviews, keep only the text and label columns,
# and discard rows whose text is missing.
df = (
    pd.read_csv("preprocessed_data.csv")
    .loc[:, ["preprocessed", "sentiment"]]
    .dropna(subset=['preprocessed'])
)

In [None]:
# Summarize the frame: index range, column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3995 entries, 0 to 3997
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   preprocessed  3995 non-null   object
 1   sentiment     3995 non-null   object
dtypes: object(2)
memory usage: 93.6+ KB


In [None]:
# Display the cleaned frame (the rendered output elides the middle rows).
df

Unnamed: 0,preprocessed,sentiment
0,tolong min pilih menu sl trader min,positive
1,coin fash biar tinggal exchange masaah,positive
2,fiturnya oke fee kompetitif tolong unggah doku...,positive
3,tolong chart binance hitung mundur waktu,positive
4,alamat wallet failed min coba setor aplikasi b...,positive
...,...,...
3993,konfirmasi kode verifikasi email copy paste an...,negative
3994,weak lag parah close harga koin cocok fast tra...,negative
3995,ribet wd susah alamat salah alamat salah puyeng,negative
3996,tokocrypto hormat verifikasi akun bantu kembang,negative


In [None]:
# Count rows per sentiment label to check how balanced the two classes are.
df["sentiment"].value_counts()

negative    1999
positive    1996
Name: sentiment, dtype: int64

# TF-IDF

In [None]:
# Text Vectorization (TF-IDF)
# Fit a TF-IDF vectorizer on the "preprocessed" text column and transform every
# document into a sparse feature row; the labels come from the "sentiment" column.
# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/test split that happens in a later cell, so the vocabulary and IDF
# weights are computed with the test documents included. Consider fitting on
# the training portion only (e.g. by putting the vectorizer in a Pipeline).
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["preprocessed"])
y = df["sentiment"]

In [None]:
# Check the container type returned by the vectorizer.
type(X)

scipy.sparse._csr.csr_matrix

In [None]:
# Report the number of columns in X, i.e. the TF-IDF feature count
# ("Jumlah fitur" = "number of features").
print("Jumlah fitur:", X.shape[1])

Jumlah fitur: 4304


In [None]:
# Split Data into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define SVM Model and Hyperparameter Grid for Grid Search
# NOTE: `gamma` only affects the rbf/poly/sigmoid kernels, so the linear-kernel
# rows report the same score for every gamma value.
svm_model = SVC()
param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf', 'poly', 'sigmoid'], 'gamma': [1, 0.1, 0.01]}

# Perform Grid Search with k-fold Cross-Validation
k = 5  # Number of folds
kf = KFold(n_splits=k, shuffle=True, random_state=42)

grid_search = GridSearchCV(svm_model, param_grid, cv=kf, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Access Accuracy Scores for Each Combination
results = grid_search.cv_results_

# Print the mean cross-validated accuracy for each combination
for mean_score, params in zip(results['mean_test_score'], results['params']):
    print(f"Accuracy: {mean_score:.4f}, Parameters: {params}")

# Get the Best Parameters and the Final Model
# GridSearchCV (refit=True by default) has already retrained the best
# configuration on the whole training split, so reuse that fitted estimator
# instead of fitting a second, identical SVC.
best_params = grid_search.best_params_
best_svm_model = grid_search.best_estimator_

# Evaluate Final Model on Test Set
y_pred = best_svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("\nBest Parameters:", best_params)
print("Accuracy on Test Set:", accuracy)

Accuracy: 0.7722, Parameters: {'C': 0.1, 'gamma': 1, 'kernel': 'linear'}
Accuracy: 0.7462, Parameters: {'C': 0.1, 'gamma': 1, 'kernel': 'rbf'}
Accuracy: 0.5181, Parameters: {'C': 0.1, 'gamma': 1, 'kernel': 'poly'}
Accuracy: 0.7725, Parameters: {'C': 0.1, 'gamma': 1, 'kernel': 'sigmoid'}
Accuracy: 0.7722, Parameters: {'C': 0.1, 'gamma': 0.1, 'kernel': 'linear'}
Accuracy: 0.7253, Parameters: {'C': 0.1, 'gamma': 0.1, 'kernel': 'rbf'}
Accuracy: 0.5053, Parameters: {'C': 0.1, 'gamma': 0.1, 'kernel': 'poly'}
Accuracy: 0.5566, Parameters: {'C': 0.1, 'gamma': 0.1, 'kernel': 'sigmoid'}
Accuracy: 0.7722, Parameters: {'C': 0.1, 'gamma': 0.01, 'kernel': 'linear'}
Accuracy: 0.5053, Parameters: {'C': 0.1, 'gamma': 0.01, 'kernel': 'rbf'}
Accuracy: 0.5053, Parameters: {'C': 0.1, 'gamma': 0.01, 'kernel': 'poly'}
Accuracy: 0.5053, Parameters: {'C': 0.1, 'gamma': 0.01, 'kernel': 'sigmoid'}
Accuracy: 0.7753, Parameters: {'C': 1, 'gamma': 1, 'kernel': 'linear'}
Accuracy: 0.7766, Parameters: {'C': 1, 'gamma

In [None]:
# Persist the tuned TF-IDF SVM to disk with pickle.
# NOTE(review): only the classifier is written here; the fitted `vectorizer`
# is not saved by this cell, yet it would be needed to transform new text
# before calling this model. Confirm it is stored elsewhere.
svm_tfidf_model = "svm_tfidf_model.pkl"

with open(svm_tfidf_model, 'wb') as file:
    pickle.dump(best_svm_model, file)

# Feature engineering

In [None]:
# Load the engineered-feature table from feat_eng.csv.
df_feat_eng = pd.read_csv("feat_eng.csv")

In [None]:
# Feature matrix: every column except the label and the "Unnamed: 0" column
# (presumably a row index saved by the CSV export — TODO confirm).
X_feat_eng = df_feat_eng.drop(columns=["sentiment", "Unnamed: 0"])
# Target vector: the sentiment label.
y_feat_eng = df_feat_eng["sentiment"]

In [None]:
# Display the engineered feature matrix.
X_feat_eng

Unnamed: 0,word_count,letter_count,sentence_count,unique_words_count,words_vs_unique,nouns,adjectives,verbs,length,nouns_vs_length,adjectives_vs_length,verbs_vs_length,nouns_vs_words,adjectives_vs_words,verbs_vs_words,mean_word_len,pos_word,neg_word
0,7,29,1,6,0.857143,6,1,0,35,0.171429,0.028571,0.000000,0.857143,0.142857,0.000000,4.142857,0,0
1,6,33,1,6,1.000000,4,1,0,38,0.105263,0.026316,0.000000,0.666667,0.166667,0.000000,5.500000,0,0
2,14,91,1,13,0.928571,8,4,2,104,0.076923,0.038462,0.019231,0.571429,0.285714,0.142857,6.500000,2,0
3,6,35,1,6,1.000000,5,1,0,40,0.125000,0.025000,0.000000,0.833333,0.166667,0.000000,5.833333,0,0
4,14,85,1,12,0.857143,10,2,2,98,0.102041,0.020408,0.020408,0.714286,0.142857,0.142857,6.071429,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3990,8,48,1,8,1.000000,8,0,0,55,0.145455,0.000000,0.000000,1.000000,0.000000,0.000000,6.000000,1,2
3991,23,119,1,22,0.956522,14,4,4,141,0.099291,0.028369,0.028369,0.608696,0.173913,0.173913,5.173913,3,6
3992,8,40,1,6,0.750000,8,0,0,47,0.170213,0.000000,0.000000,1.000000,0.000000,0.000000,5.000000,0,7
3993,6,42,1,6,1.000000,5,1,0,47,0.106383,0.021277,0.000000,0.833333,0.166667,0.000000,7.000000,3,0


In [None]:
# Split Data into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X_feat_eng, y_feat_eng, test_size=0.2, random_state=42)

# Standardize the features: the scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define SVM Model and Hyperparameter Grid for Grid Search
svm_model = SVC()
param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf', 'sigmoid'], 'gamma': [1, 0.1, 0.01]}

# Perform Grid Search with 5-fold Cross-Validation on the standardized features
grid_search = GridSearchCV(svm_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Access Accuracy Scores for Each Combination
results = grid_search.cv_results_

# Print the mean cross-validated accuracy for each combination
for mean_score, params in zip(results['mean_test_score'], results['params']):
    print(f"Accuracy: {mean_score:.4f}, Parameters: {params}")

# Get the Best Parameters and the Final Model
# The hyperparameters were tuned on standardized features, so the final model
# must be trained and evaluated on standardized features too. Fitting on the
# raw X_train would pair the tuned C/gamma with a different feature scale.
# GridSearchCV (refit=True by default) already refitted the best configuration
# on X_train_scaled, so reuse that estimator.
best_params = grid_search.best_params_
best_svm_model = grid_search.best_estimator_

# Evaluate Final Model on Test Set, using the same scaling as in training.
# Any later use of best_svm_model must pass inputs through `scaler` first.
y_pred = best_svm_model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)

print("\nBest Parameters:", best_params)
print("Accuracy on Test Set:", accuracy)

Accuracy: 0.7143, Parameters: {'C': 0.1, 'gamma': 1, 'kernel': 'linear'}
Accuracy: 0.5197, Parameters: {'C': 0.1, 'gamma': 1, 'kernel': 'rbf'}
Accuracy: 0.5372, Parameters: {'C': 0.1, 'gamma': 1, 'kernel': 'sigmoid'}
Accuracy: 0.7143, Parameters: {'C': 0.1, 'gamma': 0.1, 'kernel': 'linear'}
Accuracy: 0.7134, Parameters: {'C': 0.1, 'gamma': 0.1, 'kernel': 'rbf'}
Accuracy: 0.5835, Parameters: {'C': 0.1, 'gamma': 0.1, 'kernel': 'sigmoid'}
Accuracy: 0.7143, Parameters: {'C': 0.1, 'gamma': 0.01, 'kernel': 'linear'}
Accuracy: 0.6946, Parameters: {'C': 0.1, 'gamma': 0.01, 'kernel': 'rbf'}
Accuracy: 0.6893, Parameters: {'C': 0.1, 'gamma': 0.01, 'kernel': 'sigmoid'}
Accuracy: 0.7159, Parameters: {'C': 1, 'gamma': 1, 'kernel': 'linear'}
Accuracy: 0.6658, Parameters: {'C': 1, 'gamma': 1, 'kernel': 'rbf'}
Accuracy: 0.5335, Parameters: {'C': 1, 'gamma': 1, 'kernel': 'sigmoid'}
Accuracy: 0.7159, Parameters: {'C': 1, 'gamma': 0.1, 'kernel': 'linear'}
Accuracy: 0.7172, Parameters: {'C': 1, 'gamma': 0.

In [None]:
# Persist the feature-engineering SVM under its own file name.
svm_feateng_model = "svm_feateng_model.pkl"

# Write to `svm_feateng_model`: opening `svm_tfidf_model` here would overwrite
# the TF-IDF model saved earlier and leave this path unused.
with open(svm_feateng_model, 'wb') as file:
    pickle.dump(best_svm_model, file)