In [11]:
import pandas as pd
import pickle
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder

In [12]:
# Load the sentiment dataset; comma is already pandas' default delimiter.
SENTIMENT_CSV = 'all_sentiment.csv'
df = pd.read_csv(SENTIMENT_CSV)
df.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,clean,pemilu,provinsi,pulau,sentimen,sentimenlama,stemming_text,tanggal,tweet,tweet_text,twit,waktu
0,sesi debat bahasa inggris kapar,pre,Banten,Jawa,positif,,sesi debat bahasa inggris kapar,1/17/201911:55:00 AM,sesi debat berbahasa inggris terkapar,sesi debat berbahasa inggris terkapar,,debat 1
1,40 jaring santri nasional jksn magelang raya l...,pre,Banten,Jawa,positif,,40 jaring santri nasional jksn magelang raya l...,1/17/201911:55:00 AM,40 jaringan santri nasional jksn magelang raya...,40 jaringan santri nasional jksn magelang ray...,,debat 1
2,jaga selamat tim lapang kumpul bukti2 ijazah s...,pre,Banten,Jawa,negatif,,jaga selamat tim lapang kumpul bukti2 ijazah s...,1/17/201911:55:00 AM,dijaga keselamatan tim lapangan mengumpulkan b...,dijaga keselamatan tim lapangan mengumpulkan ...,,debat 1
3,mudah tebak berapi2 belepotan penuh bohong hoa...,pre,Banten,Jawa,negatif,,mudah tebak berapi2 belepotan penuh bohong hoa...,1/17/201911:55:00 AM,mudah ditebak berapi2 belepotan penuh bohong h...,mudah ditebak berapi2 belepotan penuh bohong...,,debat 1
4,selamat segala sifat keras hati picik angkara ...,pre,Banten,Jawa,positif,,selamat siang segala sifat keras hati picik an...,1/17/201911:55:00 AM,selamat siang segala sifat keras hati picik an...,selamat siang segala sifat keras hati picik an...,,debat 1


In [13]:
# Narrow the frame to the two columns used below: 'clean' and 'sentimen'.
data = df.filter(items=['clean', 'sentimen'])
data.head()

Unnamed: 0,clean,sentimen
0,sesi debat bahasa inggris kapar,positif
1,40 jaring santri nasional jksn magelang raya l...,positif
2,jaga selamat tim lapang kumpul bukti2 ijazah s...,negatif
3,mudah tebak berapi2 belepotan penuh bohong hoa...,negatif
4,selamat segala sifat keras hati picik angkara ...,positif


In [14]:
# Rename the selected columns: 'clean' -> 'cleanText', 'sentimen' -> 'label'.
column_names = {'clean': 'cleanText', 'sentimen': 'label'}
data = data.rename(columns=column_names)
data.head()

Unnamed: 0,cleanText,label
0,sesi debat bahasa inggris kapar,positif
1,40 jaring santri nasional jksn magelang raya l...,positif
2,jaga selamat tim lapang kumpul bukti2 ijazah s...,negatif
3,mudah tebak berapi2 belepotan penuh bohong hoa...,negatif
4,selamat segala sifat keras hati picik angkara ...,positif


In [16]:
# Encode the string labels as integers: 1 = positif, 0 = negatif.
labelencoder = LabelEncoder()
encoded_labels = labelencoder.fit_transform(data['label'])
data['label'] = encoded_labels

# Show the distinct encoded values, then a preview of the encoded frame.
print(data['label'].unique())
print(data.head())

[1 0]
                                           cleanText  label
0                    sesi debat bahasa inggris kapar      1
1  40 jaring santri nasional jksn magelang raya l...      1
2  jaga selamat tim lapang kumpul bukti2 ijazah s...      0
3  mudah tebak berapi2 belepotan penuh bohong hoa...      0
4  selamat segala sifat keras hati picik angkara ...      1


In [17]:
# SVM: token counts -> TF-IDF weighting -> linear support vector classifier.
# One seed is shared by the train/test split and the classifier so that the
# metrics printed below can be reproduced on a fresh kernel.
RANDOM_STATE = 0

pipeline_svm = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer(use_idf=True, smooth_idf=True)),
    # LinearSVC's dual coordinate-descent solver shuffles the data with an RNG;
    # pinning random_state keeps the fitted model stable between runs.
    ('clf', LinearSVC(random_state=RANDOM_STATE))
])

# Hold out 33% of the rows for evaluation; the pipeline is fitted on raw text,
# so the vectorizer vocabulary is learned from the training portion only.
X_train, X_test, y_train, y_test = train_test_split(
    data['cleanText'], data['label'], test_size=0.33, random_state=RANDOM_STATE
)
pipeline_svm.fit(X_train, y_train)
predictions = pipeline_svm.predict(X_test)

# F1 / precision / recall are the binary variants; scikit-learn scores
# label 1 as the positive class by default.
print("Accuracy: {}".format(accuracy_score(y_test, predictions)))
print("F1 Score: {}".format(f1_score(y_test, predictions)))
print("Precision score: {}".format(precision_score(y_test, predictions)))
print("Recall score: {}".format(recall_score(y_test, predictions)))
print("Confusion matrix:\n {}".format(confusion_matrix(y_test, predictions)))
print(classification_report(y_test, predictions))

Accuracy: 0.8373673480056458
F1 Score: 0.8188329839273236
Precision score: 0.8270689959414153
Recall score: 0.8107593841895866
Confusion matrix:
 [[17975  2940]
 [ 3282 14061]]
              precision    recall  f1-score   support

           0       0.85      0.86      0.85     20915
           1       0.83      0.81      0.82     17343

    accuracy                           0.84     38258
   macro avg       0.84      0.84      0.84     38258
weighted avg       0.84      0.84      0.84     38258



In [22]:
# Smoke-test the fitted pipeline on a single raw string.
# Note: this rebinds `predictions`, replacing the test-set predictions
# produced when the pipeline was evaluated.
sample_texts = ["hebat"]
predictions = pipeline_svm.predict(sample_texts)
predictions

array([1])

In [23]:
# Persist the fitted SVM pipeline to disk with pickle.
# Only unpickle this file from a trusted location: loading a pickle can
# execute arbitrary code.
pkl_filename = 'sentiment_model.pkl'

with open(pkl_filename, mode='wb') as model_file:
    pickle.dump(pipeline_svm, model_file)