In [1]:
from sklearn import linear_model, naive_bayes, svm, metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from helper.dataset_reader import read_tsv

In [2]:
raw_data = read_tsv("../dataset/ijelid-100622.tsv")
words = raw_data[1]
tags = raw_data[2]

In [3]:
# create list of sentences
from helper.data_transformer import *

word_lists, tag_lists = get_list_words_tags(raw_data[0])
sent_list = []
for word in word_lists:
	sent = ' '.join(word)
	sent_list.append(sent)

In [4]:
data_tuples = list(zip(words, tags))
df = pd.DataFrame(data_tuples, columns=['Word', 'Tag'])
X = df['Word']
y = df['Tag']

In [9]:
df['Word']

0           Tumben
1               xl
2           banter
3               go
4         download
            ...   
133786     apalagi
133787         ini
133788       video
133789        call
133790          ~~
Name: Word, Length: 133791, dtype: object

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [12]:
X_train

122715           !
88664         luck
84985         lega
124954        mbok
60776       jirrrr
            ...   
41993         Piye
97639         rare
95939      disuruh
117952    internet
43567        belum
Name: Word, Length: 89639, dtype: object

In [6]:
# Feature Engineering: Count Vectorizer
count_vect = CountVectorizer(min_df=1)

# transform the training and validation data using count vectorizer object
X_train_count = count_vect.fit_transform(X_train).toarray()
X_test_count = count_vect.transform(X_test).toarray()

X_train_count.shape, X_test_count.shape

((89639, 16973), (44152, 16973))

In [7]:
# Feature engineering: TF-IDF Vectorizer
# word level TF-IDF
tfidf_vect = TfidfVectorizer(min_df=1)
tfidf_vect.fit(sent_list)
X_train_tfidf = tfidf_vect.transform(X_train)
X_test_tfidf = tfidf_vect.transform(X_test)

# n-gram level TF-IDF
tfidf_vect_ngram = TfidfVectorizer(min_df=1)
tfidf_vect_ngram.fit(sent_list)
X_train_tfidf_ngram = tfidf_vect_ngram.transform(X_train)
X_test_tfidf_ngram = tfidf_vect_ngram.transform(X_test)

# character level TF-IDF
tfidf_vect_ngram_char = TfidfVectorizer(min_df=1)
tfidf_vect_ngram_char.fit(sent_list)
X_train_tfidf_ngram_char = tfidf_vect_ngram_char.transform(X_train)
X_test_tfidf_ngram_char = tfidf_vect_ngram_char.transform(X_test)

In [8]:
def train_model(clf, feature_vect_train, label, feature_vect_test):
	clf.fit(feature_vect_train, label)
	predict = clf.predict(feature_vect_test)

	acc = metrics.accuracy_score(predict, y_test)
	f1 = metrics.f1_score(predict, y_test, average='weighted')
	print(metrics.classification_report(predict, y_test, digits=4))
	return acc, f1

In [10]:
# Linear Classifier on Word Level TF IDF Vectors
accuracy, f1_score = train_model(linear_model.LogisticRegression(solver='lbfgs', max_iter=1000), X_train_tfidf, y_train, X_test_tfidf)
print("LR, WordLevel TF-IDF: ", accuracy, f1_score)

# Linear Classifier on Ngram Level TF IDF Vectors
accuracy, f1_score = train_model(linear_model.LogisticRegression(solver='lbfgs', max_iter=1000), X_train_tfidf_ngram, y_train, X_test_tfidf_ngram)
print("LR, N-Gram Vectors: ", accuracy, f1_score)

# Linear Classifier on Character Level TF IDF Vectors
accuracy, f1_score = train_model(linear_model.LogisticRegression(solver='lbfgs', max_iter=1000), X_train_tfidf_ngram_char, y_train, X_test_tfidf_ngram_char)
print("LR, CharLevel Vectors: ", accuracy, f1_score)

              precision    recall  f1-score   support

          EN       0.71      0.96      0.82      3960
          ID       0.89      0.94      0.92     21144
          JV       0.57      0.93      0.71      2931
   MIX-ID-EN       0.79      0.92      0.85       829
   MIX-ID-JV       0.74      0.99      0.85       229
   MIX-JV-EN       0.66      0.92      0.76       202
           O       0.98      0.67      0.80     14857

    accuracy                           0.85     44152
   macro avg       0.76      0.91      0.82     44152
weighted avg       0.88      0.85      0.85     44152

LR, WordLevel TF-IDF:  0.8528492480521833 0.8526948607822297
              precision    recall  f1-score   support

          EN       0.71      0.96      0.82      3960
          ID       0.89      0.94      0.92     21144
          JV       0.57      0.93      0.71      2931
   MIX-ID-EN       0.79      0.92      0.85       829
   MIX-ID-JV       0.74      0.99      0.85       229
   MIX-JV-EN     