In [1]:
from sklearn import linear_model, naive_bayes, svm, metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from helper.dataset_reader import read_tsv
from helper.data_transformer import create_sentence_list
from train_ml import train_model

In [2]:
# read dataset
raw_data = read_tsv("../dataset/ijelid-100622.tsv")
train_data = read_tsv("../dataset/16-07-22/train.tsv")
test_data = read_tsv("../dataset/16-07-22/test.tsv")

In [3]:
X_train = train_data[1]
y_train = train_data[2]
X_test = test_data[1]
y_test = test_data[2]

In [4]:
sent_list = create_sentence_list(data=raw_data)

In [5]:
# Feature Engineering: Count Vectorizer
count_vect = CountVectorizer(min_df=1)

# transform the training and validation data using count vectorizer object
X_train_count = count_vect.fit_transform(X_train).toarray()
X_test_count = count_vect.transform(X_test).toarray()

X_train_count.shape, X_test_count.shape

((89853, 16842), (43938, 16842))

In [6]:
# Feature engineering: TF-IDF Vectorizer
# word level TF-IDF
tfidf_vect = TfidfVectorizer(min_df=1)
tfidf_vect.fit(sent_list)
X_train_tfidf = tfidf_vect.transform(X_train)
X_test_tfidf = tfidf_vect.transform(X_test)

# n-gram level TF-IDF
tfidf_vect_ngram = TfidfVectorizer(min_df=1)
tfidf_vect_ngram.fit(sent_list)
X_train_tfidf_ngram = tfidf_vect_ngram.transform(X_train)
X_test_tfidf_ngram = tfidf_vect_ngram.transform(X_test)

# character level TF-IDF
tfidf_vect_ngram_char = TfidfVectorizer(min_df=1)
tfidf_vect_ngram_char.fit(sent_list)
X_train_tfidf_ngram_char = tfidf_vect_ngram_char.transform(X_train)
X_test_tfidf_ngram_char = tfidf_vect_ngram_char.transform(X_test)

In [7]:
# Linear Classifier on Word Level TF IDF Vectors
accuracy, f1_score = train_model(linear_model.LogisticRegression(solver='lbfgs', max_iter=1000), X_train_tfidf, X_test_tfidf, y_train, y_test)
print("LR, WordLevel TF-IDF: ", accuracy, f1_score)

              precision    recall  f1-score   support

          EN     0.7075    0.9512    0.8114      3732
          ID     0.8891    0.9437    0.9155     21012
          JV     0.5616    0.9324    0.7010      2959
   MIX-ID-EN     0.7996    0.9229    0.8568       843
   MIX-ID-JV     0.8036    0.9912    0.8876       227
   MIX-JV-EN     0.6282    0.9110    0.7436       191
           O     0.9814    0.6669    0.7941     14974

    accuracy                         0.8489     43938
   macro avg     0.7673    0.9028    0.8157     43938
weighted avg     0.8798    0.8489    0.8489     43938

LR, WordLevel TF-IDF:  0.8489234830898084 0.8488608257239701


In [8]:
# Linear Classifier on Ngram Level TF IDF Vectors
accuracy, f1_score = train_model(linear_model.LogisticRegression(solver='lbfgs', max_iter=1000), X_train_tfidf_ngram, X_test_tfidf_ngram, y_train, y_test)
print("LR, N-Gram Vectors: ", accuracy, f1_score)

              precision    recall  f1-score   support

          EN     0.7075    0.9512    0.8114      3732
          ID     0.8891    0.9437    0.9155     21012
          JV     0.5616    0.9324    0.7010      2959
   MIX-ID-EN     0.7996    0.9229    0.8568       843
   MIX-ID-JV     0.8036    0.9912    0.8876       227
   MIX-JV-EN     0.6282    0.9110    0.7436       191
           O     0.9814    0.6669    0.7941     14974

    accuracy                         0.8489     43938
   macro avg     0.7673    0.9028    0.8157     43938
weighted avg     0.8798    0.8489    0.8489     43938

LR, N-Gram Vectors:  0.8489234830898084 0.8488608257239701


In [9]:
# Linear Classifier on Character Level TF IDF Vectors
accuracy, f1_score = train_model(linear_model.LogisticRegression(solver='lbfgs', max_iter=1000), X_train_tfidf_ngram_char, X_test_tfidf_ngram_char, y_train, y_test)
print("LR, CharLevel Vectors: ", accuracy, f1_score)

              precision    recall  f1-score   support

          EN     0.7075    0.9512    0.8114      3732
          ID     0.8891    0.9437    0.9155     21012
          JV     0.5616    0.9324    0.7010      2959
   MIX-ID-EN     0.7996    0.9229    0.8568       843
   MIX-ID-JV     0.8036    0.9912    0.8876       227
   MIX-JV-EN     0.6282    0.9110    0.7436       191
           O     0.9814    0.6669    0.7941     14974

    accuracy                         0.8489     43938
   macro avg     0.7673    0.9028    0.8157     43938
weighted avg     0.8798    0.8489    0.8489     43938

LR, CharLevel Vectors:  0.8489234830898084 0.8488608257239701
