In [1]:
# Import Library

import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve
from sklearn.model_selection import validation_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MaxAbsScaler

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import hamming_loss

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load Dataset
# Read the modelling CSV into a DataFrame and preview the first rows.

DATA_PATH = '../dataset/data_modelling/data_stopwords_not_neutral.csv'
dataset = pd.read_csv(DATA_PATH, index_col=None)
dataset.head()

Unnamed: 0,Tweet,HS,Abusive,Char_Length,Casefolding,Stopwords
0,- disaat semua cowok berusaha melacak perhatia...,1,1,138,disaat semua cowok berusaha melacak perhatian...,cowok berusaha melacak perhatian lantas remehk...
1,RT USER: USER siapa yang telat ngasih tau elu?...,0,1,120,siapa yang telat ngasih tau elu edan sarap gu...,telat tau edan sarap bergaul licew
2,"41. Kadang aku berfikir, kenapa aku tetap perc...",0,0,254,41 kadang aku berfikir kenapa aku tetap percay...,41 kadang berpikir percaya tuhan jatuh berkali...
3,USER USER AKU ITU AKU\n\nKU TAU MATAMU SIPIT T...,0,0,75,aku itu aku n nku tau matamu sipit tapi dilia...,ku tau matamu sipit
4,USER USER Kaum cebong kapir udah keliatan dong...,1,1,89,kaum cebong kapir udah keliatan dongoknya dar...,kaum cebong kafir dongoknya dungu haha


In [3]:
# Keep only the stop-word-filtered text and the two label columns,
# then expose the filtered text under the name 'Tweet'.
dataset = (
    dataset
    .drop(columns=['Tweet', 'Casefolding', 'Char_Length'])
    .loc[:, ['Stopwords', 'HS', 'Abusive']]
    .rename(columns={'Stopwords': 'Tweet'})
)
dataset.head()

Unnamed: 0,Tweet,HS,Abusive
0,cowok berusaha melacak perhatian lantas remehk...,1,1
1,telat tau edan sarap bergaul licew,0,1
2,41 kadang berpikir percaya tuhan jatuh berkali...,0,0
3,ku tau matamu sipit,0,0
4,kaum cebong kafir dongoknya dungu haha,1,1


In [None]:
# dataset.to_csv("../dataset/data_modelling.csv", index=None)

# Dataset Splitting

In [None]:
# Assign features X (tweet text) and targets y (all remaining columns)

X = dataset['Tweet']
y = dataset.drop(columns=['Tweet'])
(X.shape, y.shape)

In [None]:
# Hold out 30% of the rows for testing; stratify on the label columns
# and fix the seed so the split is reproducible.

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    stratify=y,
    shuffle=True,
    test_size=0.3,
    random_state=42,
)

(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# TFIDF Vectorizer

In [None]:
# NOTE(review): `illustration` and `demo` are imported again from
# luwiji.logistic_regression further down, which rebinds both names;
# these cells must run before that import to show the text_proc objects.
from luwiji.text_proc import illustration, demo
# Display the `text` attribute of the text_proc demo object.
demo.text

In [None]:
# Display the `structured` illustration.
illustration.structured

In [None]:
# Display the `bag_of_words` illustration.
illustration.bag_of_words

In [None]:
# Display the `inverse_df` illustration.
illustration.inverse_df

In [None]:
# Display the `practical_idf` illustration.
illustration.practical_idf

## Word Vectorizer

In [None]:
# Build Vector
# Learn the TF-IDF vocabulary (at most 3000 features, 1- to 5-grams) from the
# training tweets only. The values are cast to unicode so non-string entries
# (e.g. NaN) do not break the tokenizer.

tfidf = TfidfVectorizer(max_features=3000, ngram_range=(1,5))
# fit() returns the vectorizer itself, not a document-term matrix, so its
# result is intentionally not bound to X_train_tfidf here.
tfidf.fit(X_train.values.astype('U'))

In [None]:
# Vectorize both splits with the vocabulary learned from the training data.
X_train_tfidf, X_test_tfidf = (
    tfidf.transform(split.values.astype('U'))
    for split in (X_train, X_test)
)

In [None]:
# Show the fitted vectorizer's `vocabulary_` attribute.
tfidf.vocabulary_

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when the installed version provides it and fall
# back to the old method on older releases.
(tfidf.get_feature_names_out()
 if hasattr(tfidf, 'get_feature_names_out')
 else tfidf.get_feature_names())

In [None]:
# Term-by-document view of the training TF-IDF matrix:
# rows are vocabulary terms, columns are training documents D1..Dn.
# get_feature_names() was removed in scikit-learn 1.2, so prefer the
# *_out variant when it exists.
feature_names = (tfidf.get_feature_names_out()
                 if hasattr(tfidf, 'get_feature_names_out')
                 else tfidf.get_feature_names())
# toarray() returns a plain ndarray (todense() returns the legacy np.matrix).
df = pd.DataFrame(X_train_tfidf.toarray().T,
                 index = feature_names,
                 columns=[f'D{i+1}' for i in range(len(X_train))])
df

In [None]:
# df.to_csv("../dataset/tfidf_feature_names.csv", index=True)

## Cosine Similarities

In [None]:
# Cosine similarity between the test row at position 10 and every test row;
# argsort() then gives row positions ordered by ascending similarity.
query_vec = X_test_tfidf[10]
sim = cosine_similarity(query_vec, X_test_tfidf)
sim.argsort()

In [None]:
# `sim` was computed against X_test_tfidf, so its ranks are row POSITIONS in
# X_test. Use positional .iloc: after the shuffled split the integer labels
# (e.g. 1050) are not guaranteed to exist in a given partition, and
# label-based lookup would raise KeyError.
ranked = sim.argsort()[0][::-1]  # most similar first
# ranked[0] is presumably the query tweet itself (similarity 1) - verify.
print(X_test.iloc[ranked[0]])
print("----")
print(X_test.iloc[ranked[1]])

In [None]:
# Dump both sparse TF-IDF matrices, separated by a divider line.
print("X_train", X_train_tfidf, "-----", "X_test", X_test_tfidf, sep="\n")

In [None]:
# assignment label
# The targets contain exactly two columns (HS, Abusive); listing a third
# 'Neutral' name makes classification_report raise because the number of
# target names no longer matches the number of labels.

label_names = ['HS', 'Abusive']

# MaxAbsScaler

In [None]:
# Fit a max-abs scaler on the training TF-IDF matrix only.
scaler = MaxAbsScaler()
scaler = scaler.fit(X_train_tfidf)

In [None]:
# Show the fitted scaler object.
scaler

In [None]:
# Apply the training-fitted scaler to both splits.
X_train_tfidf_scaled, X_test_tfidf_scaled = (
    scaler.transform(matrix) for matrix in (X_train_tfidf, X_test_tfidf)
)

In [None]:
# Print the scaled training matrix.
# NOTE(review): in the visible cells the model is fitted on X_train_tfidf,
# not on this scaled matrix - confirm whether scaling is meant to be used.
print(X_train_tfidf_scaled)

# K-Fold Cross Validation

In [None]:
# 10-fold stratified, shuffled cross-validation splitter with a fixed seed.
kfold = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)
kfold

# Logistic Regression

In [None]:
# NOTE(review): this import rebinds `illustration` and `demo`, which were
# previously imported from luwiji.text_proc.
from luwiji.logistic_regression import illustration, demo

# Display the `multinomial` illustration.
illustration.multinomial

In [None]:
# Run the loss-curve demo.
demo.loss_curve()

In [None]:
# Run the logistic-regression demo.
demo.logistic_regression()

In [None]:
# Run the loss-plane demo.
demo.loss_plane()

In [None]:
# Base estimator for the grid search below; the iteration cap is raised to 5000.
model_logreg = LogisticRegression(max_iter=5000)

# GridSearch CV

In [None]:
# Hyperparameter Tuning
# Search grid for the logistic-regression base estimator.

penalty = ['l2']
# C must be strictly positive for LogisticRegression. linspace(0, 2, 20)
# starts at exactly 0, which makes every fit for that candidate fail, so the
# first point is dropped and the remaining 19 candidates are kept unchanged.
C = np.linspace(0, 2, 20)[1:]
solver = ['lbfgs']

hyperparameters = dict(penalty=penalty,
                       C=C,
                       solver=solver)

# Exhaustive search over the grid using the stratified 10-fold splitter,
# on all available cores.
grid = GridSearchCV(model_logreg,
                    hyperparameters,
                    cv=kfold,
                    n_jobs=-1,
                    verbose=200)

# MultiOutput Classifier

In [None]:
# Wrap the grid search so one tuned classifier is fitted per target column,
# then train on the TF-IDF training matrix.
clf = MultiOutputClassifier(grid)
clf = clf.fit(X_train_tfidf, y_train)

## fit logistic regression model

In [None]:
# # Validation Curve
# train_scores, valid_scores = validation_curve(clf,
#     X_test_tfidf,
#     y_train,
#     hyperparameters,
#     groups=None,
#     cv=kfold,
#     scoring="f1",
#     n_jobs=-1)

In [None]:
# # Learning Curve
# train_sizes, train_scores, test_scores, fit_times, _ = learning_curve(
#     clf,
#     X_train_tfidf_scaled,
#     y_train,
#     return_times=True)

In [None]:
# train_sizes, train_scores, test_scores, fit_times

In [None]:
# train_sizes.shape, train_scores.shape, test_scores.shape, fit_times.shape

In [None]:
# train_mean = np.mean(train_scores, axis=1)
# train_std = np.std(train_scores, axis=1)
# test_mean = np.mean(test_scores, axis=1)
# test_std = np.std(test_scores, axis=1)

In [None]:
# plt.plot(train_sizes, train_mean, label = "Training Score")
# plt.plot(train_sizes, test_mean, label = "Cross-Validation Score")

# plt.fill_between(train_sizes, train_mean-train_std, train_mean+train_std, color="#DDDDDD")
# plt.fill_between(train_sizes, test_mean-test_std, test_mean+test_std, color="#DDDDDD")

# plt.title("Learning Curve")
# plt.xlabel("Training Size")
# plt.ylabel("Accuracy Score")
# plt.legend(loc = "best")

In [None]:
# clf

# Metrics Classification

## hamming loss

In [None]:
# Predict all target columns for the held-out test matrix.
y_pred = clf.predict(X_test_tfidf)

In [None]:
# Hamming loss between the true and predicted test labels.
hamming_loss(y_test, y_pred)

## classification report

In [None]:
# Per-label precision / recall / F1 on the test split.
report = classification_report(y_test, y_pred, target_names=label_names)
print(report)

## accuracy metrics

In [None]:
# Score the fitted classifier on the training and the test split.
train_acc = clf.score(X_train_tfidf, y_train)
test_acc = clf.score(X_test_tfidf, y_test)
print("Accuracy Score Data Training", train_acc)
print("Accuracy Score Data Testing", test_acc)

In [None]:
# accuracy_score expects (y_true, y_pred); the arguments were swapped.
# The value is the same for accuracy, but the documented order keeps this
# call consistent with hamming_loss / classification_report above.
score_lr = accuracy_score(y_test, y_pred)
print(score_lr)

# Save Model

In [None]:
# Destination file for the serialized classifier.
# NOTE(review): this is an absolute, machine-specific path, while the data
# files in this notebook are addressed with relative '../' paths - consider
# making it relative or configurable so the notebook runs elsewhere.
model_path = "/home/dadandw/scrispy/build_model/skripsi/model/algo_model/model_logreg.jlib"

In [None]:
# Persist the fitted multi-output classifier to model_path.
# NOTE(review): the fitted `tfidf` vectorizer is not saved in the visible
# cells; it is needed to transform new text before predicting - confirm it is
# stored elsewhere.
joblib.dump(clf, model_path)

In [None]:
# Load the separate testing CSV and display it.
TEST_PATH = "../data_test/data_testing.csv"
test = pd.read_csv(TEST_PATH, index_col=None)
test

In [None]:
# Vectorize the external test tweets with the fitted vocabulary.
# Cast to unicode exactly as done for X_train / X_test so that missing or
# non-string entries do not make the vectorizer fail.
test_tfidf = tfidf.transform(test.Tweet.values.astype('U'))

In [None]:
# Show the raw predictions for the external test tweets.
clf.predict(test_tfidf)

In [None]:
# The classifier was trained on two target columns (HS, Abusive), so
# predict() returns two columns; assigning them to three column names
# ('Neutral' included) raises a ValueError.
test[['HS', 'Abusive']] = clf.predict(test_tfidf)
test

# Pipeline

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# Text-preprocessing pipeline: TF-IDF vectorization followed by max-abs scaling.
# Each step must be its own (name, estimator) 2-tuple; the misplaced closing
# parenthesis previously nested the scaler inside the 'tfidf' tuple.
# ngram_range starts at 1 (0 is not a valid n-gram size), matching the
# stand-alone vectorizer built earlier in the notebook.
lang_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=3000, ngram_range=(1, 5))),
    ('scaler', MaxAbsScaler()),
])

In [None]:
# Call the method: without parentheses only the bound-method repr is shown,
# not the pipeline parameters.
lang_pipe.get_params()

In [None]:
# Modelling pipeline with a single final step: a multi-output wrapper around a
# grid-searched logistic regression (the same nesting used for `clf` above).
# Previously LogisticRegression and GridSearchCV were listed as intermediate
# pipeline steps (intermediate steps must be transformers) and a misplaced
# parenthesis turned the 'grid' step into a 3-tuple.
# multi_class="multinomial" is dropped: it was only set on the unused
# intermediate step, and the grid search ran on the plain max_iter=5000 model.
algo_pipe = Pipeline([
    ('clf', MultiOutputClassifier(
        GridSearchCV(LogisticRegression(max_iter=5000),
                     hyperparameters,
                     cv=kfold,
                     n_jobs=-1,
                     verbose=200))),
])

In [None]:
# End-to-end model: text preprocessing followed by the classifier pipeline.
model = Pipeline(steps=[
    ('lang_pipe', lang_pipe),
    ('algo_pipe', algo_pipe),
])

In [None]:
# The pipeline starts with a TfidfVectorizer, so it must be fitted on the raw
# training text rather than on the already-vectorized X_train_tfidf matrix.
# Cast to unicode as in the stand-alone vectorizer cells.
model.fit(X_train.values.astype('U'), y_train)