In [1]:
#import the needed libraries 

import pandas as pd
import pickle
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score

In [2]:
#read the dataset

data = pd.read_excel('all-data-clean.xlsx')
#print(data['cleanText'],data['label'])
len(data)

5462

In [3]:
#Multinomial Naive Bayes
pipeline_mnb = Pipeline([
    ('vect', CountVectorizer()),
    #('vect', CountVectorizer(ngram_range=(2,2))),
    ('tfidf', TfidfTransformer(use_idf=True, smooth_idf=True)),
    ('clf', MultinomialNB(alpha=1))
])

X_train, X_test, y_train, y_test = train_test_split(data['cleanText'], data['label'],  test_size=0.33, random_state = 0)
pipeline_mnb.fit(X_train, y_train)
predictions = pipeline_mnb.predict(X_test)

print("Accuracy: {0:.4f}".format(accuracy_score(y_test, predictions)))
print("F1 Score: {0:.4f}".format(f1_score(y_test, predictions)))
print("Precision score: {0:.4f}".format(precision_score(y_test, predictions)))
print("Recall score: {0:.4f}".format(recall_score(y_test, predictions)))
print("Confusion matrix:\n {}".format(confusion_matrix(y_test, predictions)))
print (classification_report(y_test, predictions))

Accuracy: 0.9834
F1 Score: 0.9836
Precision score: 0.9912
Recall score: 0.9762
Confusion matrix:
 [[871   8]
 [ 22 902]]
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       879
           1       0.99      0.98      0.98       924

    accuracy                           0.98      1803
   macro avg       0.98      0.98      0.98      1803
weighted avg       0.98      0.98      0.98      1803



In [4]:
predictions = pipeline_mnb.predict(["dasar brengsek","payah keparat","saya sangat menyukai kerja anda"])
predictions

array([0, 0, 1])

In [5]:
#SVM
pipeline_svm = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer(use_idf=True, smooth_idf=True)),
    ('clf', LinearSVC())
])

X_train, X_test, y_train, y_test = train_test_split(data['cleanText'], data['label'],  test_size=0.33, random_state = 0)
pipeline_svm.fit(X_train, y_train)
predictions = pipeline_svm.predict(X_test)

print("Accuracy: {}".format(accuracy_score(y_test, predictions)))
print("F1 Score: {}".format(f1_score(y_test, predictions)))
print("Precision score: {}".format(precision_score(y_test, predictions)))
print("Recall score: {}".format(recall_score(y_test, predictions)))
print("Confusion matrix:\n {}".format(confusion_matrix(y_test, predictions)))
print (classification_report(y_test, predictions))

Accuracy: 0.9916805324459235
F1 Score: 0.9919050188882892
Precision score: 0.9892357373519914
Recall score: 0.9945887445887446
Confusion matrix:
 [[869  10]
 [  5 919]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       879
           1       0.99      0.99      0.99       924

    accuracy                           0.99      1803
   macro avg       0.99      0.99      0.99      1803
weighted avg       0.99      0.99      0.99      1803



In [6]:
predictions = pipeline_svm.predict(["dasar brengsek"])
predictions

array([0])

In [7]:
#Save SVM model
pkl_filename = 'svm_abusive_model.pkl'

with open(pkl_filename, 'wb') as file:
    pickle.dump(pipeline_svm, file)

In [8]:
#Load SVM model

with open(pkl_filename, 'rb') as file:
    loaded_model = pickle.load(file)

loaded_model

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', LinearSVC())])

In [9]:
#Use loaded model to predict new sentence

score = loaded_model.score(X_test, y_test)

print("Test score: {0:.2f} %".format(score))

y_pred = loaded_model.predict(X_test)
y_pred

Test score: 0.99 %


array([1, 1, 1, ..., 1, 1, 1])