### Data

In [1]:
import pandas as pd

data = pd.read_csv('dataset.csv')
data.head()

Unnamed: 0,teks,jenis
0,Jikalau kita bertanam padi \n Senanglah makan ...,pantun
1,Kalau ada air bersih \n Bolehlah kita mengguna...,pantun
2,Buah Hatiku \n Satu... \n perlahan hadir... \n...,puisi
3,Anak ayam turun sepuluh \n Mati satu tinggal s...,pantun
4,Kala dulu \n Ada sungai mengalir air \n Selalu...,puisi


In [2]:
# split data train and data test

from sklearn.model_selection import train_test_split

X = data['teks']
y = data['jenis']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)

### Vectorizing the data

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create feature vectors
vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)

### Creating a Linear SVM Model

In [4]:
import time
from sklearn import svm
from sklearn.metrics import classification_report

# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear',probability=True)
t0 = time.time()
classifier_linear.fit(train_vectors, y_train)
t1 = time.time()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1

In [5]:
# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(y_test, prediction_linear, output_dict=True)
print('pantun: ', report['pantun'])
print('puisi: ', report['puisi'])

Training time: 0.616604s; Prediction time: 0.017993s
pantun:  {'precision': 0.9538461538461539, 'recall': 0.9841269841269841, 'f1-score': 0.96875, 'support': 63}
puisi:  {'precision': 0.9850746268656716, 'recall': 0.9565217391304348, 'f1-score': 0.9705882352941176, 'support': 69}


In [6]:
prediction_linear

array(['pantun', 'puisi', 'puisi', 'pantun', 'pantun', 'puisi', 'puisi',
       'pantun', 'pantun', 'puisi', 'puisi', 'pantun', 'puisi', 'pantun',
       'pantun', 'puisi', 'puisi', 'pantun', 'pantun', 'puisi', 'puisi',
       'puisi', 'puisi', 'puisi', 'puisi', 'puisi', 'puisi', 'puisi',
       'pantun', 'pantun', 'puisi', 'pantun', 'pantun', 'puisi', 'pantun',
       'pantun', 'puisi', 'puisi', 'pantun', 'puisi', 'pantun', 'pantun',
       'puisi', 'pantun', 'pantun', 'pantun', 'pantun', 'puisi', 'puisi',
       'puisi', 'puisi', 'puisi', 'pantun', 'pantun', 'puisi', 'pantun',
       'pantun', 'puisi', 'puisi', 'puisi', 'puisi', 'puisi', 'pantun',
       'puisi', 'pantun', 'pantun', 'puisi', 'pantun', 'pantun', 'pantun',
       'pantun', 'puisi', 'puisi', 'pantun', 'pantun', 'pantun', 'puisi',
       'pantun', 'puisi', 'puisi', 'puisi', 'puisi', 'pantun', 'pantun',
       'puisi', 'puisi', 'puisi', 'puisi', 'pantun', 'puisi', 'pantun',
       'pantun', 'puisi', 'puisi', 'puisi', 'pan

### Prediction

In [7]:
teks = """Perlahan menipis gumpalan
Awan langit memutih
Tergenang di kolam penampungan
Sorot mata bocah pengungsi
Berbinar binar
Sepanjang jalan bersorak sorak
Basah badan menarik gerobak
Jirigen jirigen air tertata rapi
Tanah kita tak kering lagi
Hujan telah kembali"""

teks_vector = vectorizer.transform([teks]) # vectorizing
print(classifier_linear.predict(teks_vector))

['puisi']


In [8]:
print(classifier_linear.predict_proba(teks_vector))

[[0.21298043 0.78701957]]


### Pickling the Model

In [9]:
import pickle

# pickling the vectorizer
pickle.dump(vectorizer, open('vectorizer.sav', 'wb'))

# pickling the model
pickle.dump(classifier_linear, open('classifier.sav', 'wb'))