In [1]:
# Import Library

import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import learning_curve

from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the dataset: read the CSV at `path` into a DataFrame.
path = '../dataset/data_modelling/data_preprocessing.csv'
dataset = pd.read_csv(path)  # index_col is left at its default (None)

# Preview the first five rows.
dataset.head(n=5)

Unnamed: 0,Tweet,HS,Abusive,Char_Length,Casefolding,Stopwords,Char_Length_Prep
0,"41. Kadang aku berfikir, kenapa aku tetap perc...",0,0,138,41 kadang aku berfikir kenapa aku tetap percay...,41 kadang berpikir percaya tuhan jatuh berkali...,141
1,USER USER AKU ITU AKU\n\nKU TAU MATAMU SIPIT T...,0,0,120,aku itu aku n nku tau matamu sipit tapi dilia...,ku tau matamu sipit,19
2,deklarasi pilkada 2018 aman dan anti hoax warg...,0,0,254,deklarasi pilkada 2018 aman dan anti hoax warg...,deklarasi pilihan kepala daerah 2018 aman anti...,75
3,Nah admin belanja satu lagi port terbaik nak m...,0,0,75,nah admin belanja satu lagi po terbaik nak mak...,admin belanja po terbaik nak makan ais kepal m...,171
4,Kalo belajar ekonomi mestinya jago memprivatis...,0,0,89,kalo belajar ekonomi mestinya jago memprivatis...,belajar ekonomi mestinya jago memprivatisasi h...,66


In [3]:
# Print the column names, non-null counts and dtypes of the loaded frame.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6971 entries, 0 to 6970
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Tweet             6971 non-null   object
 1   HS                6971 non-null   int64 
 2   Abusive           6971 non-null   int64 
 3   Char_Length       6971 non-null   int64 
 4   Casefolding       6971 non-null   object
 5   Stopwords         6971 non-null   object
 6   Char_Length_Prep  6971 non-null   int64 
dtypes: int64(4), object(3)
memory usage: 381.4+ KB


In [4]:
# Keep only the stopword-filtered text and the two label columns, then expose
# the text column under the name 'Tweet' (the raw 'Tweet' column is discarded
# by the selection itself, so no separate drop is required).
dataset = (
    dataset
    .loc[:, ['Stopwords', 'HS', 'Abusive']]
    .rename(columns={'Stopwords': 'Tweet'})
)
dataset.head()

Unnamed: 0,Tweet,HS,Abusive
0,41 kadang berpikir percaya tuhan jatuh berkali...,0,0
1,ku tau matamu sipit,0,0
2,deklarasi pilihan kepala daerah 2018 aman anti...,0,0
3,admin belanja po terbaik nak makan ais kepal m...,0,0
4,belajar ekonomi mestinya jago memprivatisasi h...,0,0


# Dataset Splitting

In [5]:
# Assign the feature variable X (tweet text) and the target frame y
# (every column of `dataset` except 'Tweet').
X = dataset['Tweet']
y = dataset.drop('Tweet', axis=1)

# Show both shapes as a quick sanity check.
(X.shape, y.shape)

((6971,), (6971, 2))

In [6]:
# Hold out 30% of the rows as the test set. Rows are shuffled with a fixed
# seed and the split is stratified on the label columns in `y`.
split_options = dict(test_size=0.3, shuffle=True, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Shapes of the four resulting pieces, in the order they were returned.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((4879,), (2092,), (4879, 2), (2092, 2))

# TFIDF Vectorizer

In [None]:
# NOTE(review): luwiji looks like an external illustration/demo package;
# confirm it is installed in the environment that runs this notebook.
# The names `illustration` and `demo` are rebound later by the import from
# luwiji.logistic_regression.
from luwiji.text_proc import illustration, demo
# Display the `text` attribute of the text-processing demo object.
demo.text

In [None]:
# Display the `structured` illustration (from luwiji.text_proc at this point of the notebook).
illustration.structured

In [None]:
# Display the `bag_of_words` illustration (from luwiji.text_proc at this point of the notebook).
illustration.bag_of_words

In [None]:
# Display the `inverse_df` illustration (from luwiji.text_proc at this point of the notebook).
illustration.inverse_df

In [None]:
# Display the `practical_idf` illustration (from luwiji.text_proc at this point of the notebook).
illustration.practical_idf

In [None]:
# Build the TF-IDF vectors.
# The vocabulary is capped at 5000 features drawn from uni-, bi- and tri-grams.
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))

# .astype('U') coerces every entry to a unicode string before vectorizing.
# The vectorizer is fitted on the training split only and learns its
# vocabulary/IDF weights and produces the training matrix in a single pass;
# the test split is only transformed with that fitted vocabulary.
X_train_tfidf = tfidf.fit_transform(X_train.values.astype('U'))
X_test_tfidf = tfidf.transform(X_test.values.astype('U'))

In [None]:
# Term-document view of the training matrix: one row per n-gram feature,
# one column per training tweet (D1, D2, ...).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older versions.
feature_names = (tfidf.get_feature_names_out()
                 if hasattr(tfidf, 'get_feature_names_out')
                 else tfidf.get_feature_names())

# NOTE: this densifies the whole sparse matrix (features x documents), which
# can take a lot of memory for large corpora.
# .toarray() yields a plain ndarray rather than the legacy np.matrix type.
df = pd.DataFrame(X_train_tfidf.toarray().T,
                  index=feature_names,
                  columns=[f'D{i+1}' for i in range(X_train_tfidf.shape[0])])
df

In [None]:
# Label names, listed in the same order as the target columns (HS, Abusive);
# passed as target_names to classification_report further down.

label_names = ['HS', 'Abusive']

# K-Fold Cross Validation

In [None]:
# 10-fold stratified splitter with shuffling and a fixed seed; passed as `cv`
# to the randomized searches below.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
kfold

# Logistic Regression

In [None]:
# Rebinds `illustration` and `demo` (previously imported from
# luwiji.text_proc) to the logistic-regression versions.
from luwiji.logistic_regression import illustration, demo

# Display the `multinomial` illustration.
illustration.multinomial

In [None]:
# Run the `loss_curve` demo from luwiji.logistic_regression.
demo.loss_curve()

In [None]:
# Run the `logistic_regression` demo from luwiji.logistic_regression.
demo.logistic_regression()

In [None]:
# Run the `loss_plane` demo from luwiji.logistic_regression.
demo.loss_plane()

In [None]:
# Base estimator with default settings; penalty, C and solver are chosen by
# the randomized search defined below.
model_logreg = LogisticRegression()

# Randomized Search CV

In [None]:
# Hyperparameter Tuning

# C (inverse regularization strength) must be strictly positive, so the
# leading 0 produced by np.linspace(0, 10, 150) is dropped.
C_values = np.linspace(0, 10, 150)[1:]

# One sub-space per solver so that only valid combinations are sampled:
# liblinear supports both l1 and l2, lbfgs supports l2 but not l1.
# With a single flat dict, invalid candidates (C=0, or l1 with lbfgs) fail to
# fit and end up with NaN scores, which is easy to miss while warnings are
# suppressed.
hyperparameters_logit = [
    {'solver': ['liblinear'], 'penalty': ['l2', 'l1'], 'C': C_values},
    {'solver': ['lbfgs'], 'penalty': ['l2'], 'C': C_values},
]

# random_state makes the sampled candidates reproducible between runs;
# verbose=1 keeps a progress summary without flooding the cell output.
grid = RandomizedSearchCV(model_logreg, hyperparameters_logit,
                          cv=kfold,
                          n_jobs=-1,
                          random_state=42,
                          verbose=1, return_train_score=True)

# MultiOutput Classifier

In [None]:
# Wrap the randomized search so that it is fitted on the two-column target
# frame (HS, Abusive) through scikit-learn's multi-output strategy.
clf = MultiOutputClassifier(grid)
# Fit on the TF-IDF training matrix; the fitted classifier is the cell output.
clf.fit(X_train_tfidf, y_train)

## Learning curve for the fitted logistic regression model

In [None]:
# Learning Curve
# NOTE(review): `clf` wraps a RandomizedSearchCV, so every point of the curve
# re-runs the whole hyperparameter search; expect this cell to be slow.
# return_times=True appends fit and score timings to the returned tuple. The
# score timings get a real name because assigning to `_` would shadow
# IPython's "last output" variable for the rest of the session.
train_sizes, train_scores, test_scores, fit_times, score_times = learning_curve(
    clf,
    X_train_tfidf,
    y_train,
    return_times=True)

In [None]:
# Raw learning-curve outputs, still bundled under `data` as before.
data = train_sizes, train_scores, test_scores, fit_times

# Per-training-size summary (averaged over the CV folds on axis 1).
# The previous pd.DataFrame(data) call mixed a 1-D array with 2-D arrays and
# its result was thrown away, so nothing useful was ever shown.
learning_curve_summary = pd.DataFrame({
    'train_size': train_sizes,
    'train_score_mean': train_scores.mean(axis=1),
    'test_score_mean': test_scores.mean(axis=1),
    'fit_time_mean': fit_times.mean(axis=1),
})

# Shapes of the raw arrays, then the summary table as the cell output.
print(train_sizes.shape, train_scores.shape, test_scores.shape, fit_times.shape)
learning_curve_summary

In [None]:
# Mean and spread of the scores across CV folds, for each training size.
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_mean, test_std = test_scores.mean(axis=1), test_scores.std(axis=1)

fig, ax = plt.subplots()

# Mean curves first, then a grey band of +/- one standard deviation around each.
ax.plot(train_sizes, train_mean, label="Training Score")
ax.plot(train_sizes, test_mean, label="Cross-Validation Score")
ax.fill_between(train_sizes, train_mean - train_std, train_mean + train_std,
                color="#DDDDDD")
ax.fill_between(train_sizes, test_mean - test_std, test_mean + test_std,
                color="#DDDDDD")

ax.set(title="Learning Curve", xlabel="Training Size", ylabel="Accuracy Score")
ax.legend()

In [None]:
# Show the fitted multi-output classifier.
clf

# Classification Metrics

In [None]:
# Predict both labels for the held-out test matrix with the logistic-regression
# classifier. Note that `y_pred` is rebound later by the KNN cell.
y_pred = clf.predict(X_test_tfidf)

## classification report

In [None]:
# Per-label precision / recall / F1 on the test split.
report_logreg = classification_report(y_test, y_pred, target_names=label_names)
print(report_logreg)

## accuracy metrics

In [None]:
# Score the fitted classifier on both splits, then report the two numbers.
logreg_train_accuracy = clf.score(X_train_tfidf, y_train)
logreg_test_accuracy = clf.score(X_test_tfidf, y_test)

print("Accuracy Score Data Training", logreg_train_accuracy)
print("Accuracy Score Data Testing", logreg_test_accuracy)

In [None]:
# Accuracy of the logistic-regression predictions on the test split.
# accuracy_score expects (y_true, y_pred); the ground truth goes first, in the
# same order used for classification_report.
# `y_pred` must still hold the logistic-regression predictions here (the KNN
# cell further down rebinds it).
score_lr = accuracy_score(y_test, y_pred)
print(score_lr)

In [None]:
# Inspect the fitted per-target estimators held by the multi-output wrapper.
clf.estimators_

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline comparison model: k-nearest neighbours on the same TF-IDF features.
knn = KNeighborsClassifier()

# Hyperparameter Tuning
# Odd neighbour counts from 1 to 49, with uniform or distance-based weighting.
n_neighbors = range(1, 51, 2)
weights = ['uniform', 'distance']

hyperparameters_knn = dict(n_neighbors=n_neighbors,
                           weights=weights)

# random_state makes the sampled candidates reproducible between runs;
# verbose=1 keeps a progress summary without flooding the cell output.
grid_knn = RandomizedSearchCV(knn,
                              hyperparameters_knn,
                              cv=kfold,
                              n_jobs=-1,
                              random_state=42,
                              verbose=1)

# Fit the randomized search on the two-column target through the multi-output wrapper.
clf_knn = MultiOutputClassifier(grid_knn).fit(X_train_tfidf, y_train)

# NOTE: this rebinds `y_pred`, replacing the logistic-regression predictions;
# the report printed in the following cell reads this variable.
y_pred = clf_knn.predict(X_test_tfidf)

In [None]:
# Per-label precision / recall / F1 for whatever `y_pred` currently holds
# (the KNN predictions when the notebook is run top to bottom).
report_knn = classification_report(y_test, y_pred, target_names=label_names)
print(report_knn)

In [None]:
# Score the fitted KNN classifier on both splits, then report the two numbers.
knn_train_accuracy = clf_knn.score(X_train_tfidf, y_train)
knn_test_accuracy = clf_knn.score(X_test_tfidf, y_test)

print("Accuracy Score Data Training", knn_train_accuracy)
print("Accuracy Score Data Testing", knn_test_accuracy)

# Save Model

In [None]:
import os

# Destination of the serialized logistic-regression model.
# The default is the original absolute path, which only exists on the author's
# machine; set the MODEL_LOGREG_PATH environment variable to save elsewhere
# without editing the notebook.
model_path = os.environ.get(
    "MODEL_LOGREG_PATH",
    "/home/dadandw/scrispy/build_model/skripsi/model/algo_model/model_logreg.jlib",
)

In [None]:
# Serialize the fitted logistic-regression multi-output classifier to `model_path`.
# NOTE(review): the fitted `tfidf` vectorizer is not saved anywhere in this
# notebook; predicting on raw text later will need it too, so confirm that it
# is persisted elsewhere.
joblib.dump(clf, model_path)