# Load Libraries


In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import nltk
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, BatchNormalization
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, Dropout, ReLU
from tensorflow.keras.callbacks import EarlyStopping
from keras.utils import to_categorical

import os

# Load Data


In [None]:
# Lazily build the full path of every file found under the "input" tree
# and print the paths one per line.
input_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk("input")
    for name in names
)
for path in input_files:
    print(path)

In [None]:
def load_dataset(filepath):
    """Read a ``text;label`` file into a two-column DataFrame.

    Each non-blank line is stripped of surrounding whitespace and split on
    its LAST ``;``, so a semicolon inside the text does not produce extra
    fields. Blank lines are skipped.

    Args:
        filepath: Path of the UTF-8 text file to read.

    Returns:
        A ``pandas.DataFrame`` with the columns ``"text"`` and ``"label"``,
        one row per non-blank line (empty if the file has no such lines).

    Raises:
        ValueError: If a non-blank line contains no ``;`` separator.
    """
    rows = []
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(filepath, encoding="utf-8") as f:
        # Iterate the file directly instead of materialising readlines().
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                continue
            text, sep, label = line.rpartition(";")
            if not sep:
                raise ValueError(
                    f"{filepath}:{line_no}: expected 'text;label', got {line!r}"
                )
            rows.append((text, label))
    return pd.DataFrame(rows, columns=["text", "label"])

In [None]:
# Read the train / validation / test files, in that order, with the same loader.
train_data, validation_data, test_data = map(
    load_dataset,
    ("input/train.txt", "input/val.txt", "input/test.txt"),
)

In [None]:
# Stack the three splits into one frame. ignore_index=True renumbers the rows
# 0..n-1; without it each split keeps its own 0-based index, so labels repeat
# and label-based lookups such as df.loc[0] return several rows.
df = pd.concat([train_data, validation_data, test_data], ignore_index=True)
df.head()

In [None]:
# Show the (rows, columns) size of the combined frame.
df.shape

# Explore Data


In [None]:
# Count missing values in each column.
df.isna().sum()

In [None]:
# Count rows that are exact repeats of an earlier row.
df.duplicated().sum()

In [None]:
# Remove the repeated rows from df in place.
df.drop_duplicates(inplace=True)

In [None]:
# Print the frame summary (columns, non-null counts, dtypes).
df.info()

In [None]:
# Distinct label values, in order of first appearance.
pd.unique(df["label"])

In [None]:
# Number of samples per label, most frequent first.
df["label"].value_counts()

In [None]:
# Bar chart of the per-label sample counts, one colour per bar.
bar_colors = ["green", "gray", "pink", "black", "red", "orange"]
label_counts = df["label"].value_counts()
label_counts.plot.bar(color=bar_colors)
plt.xlabel("Labels")
plt.ylabel("Number of samples")
plt.show()

In [None]:
# Store each text's character count in a new "length" column, then plot
# one density curve of that length per label.
df["length"] = df["text"].map(len)
fig = plt.figure(figsize=(10, 6))
sns.kdeplot(x=df["length"], hue=df["label"])
plt.show()

In [None]:
# NLTK raises LookupError when the stopwords corpus has not been downloaded;
# fetch it once and retry so this cell also runs on a fresh environment.
try:
    stopwords = set(nltk.corpus.stopwords.words("english"))
except LookupError:
    nltk.download("stopwords", quiet=True)
    stopwords = set(nltk.corpus.stopwords.words("english"))

# Distinct labels in order of first appearance in df.
labels = df["label"].unique()

# One word cloud per label, built from all texts of that label joined together.
for label in labels:
    text = " ".join(df[df["label"] == label]["text"])
    wordcloud = WordCloud(
        width=800,
        height=800,
        background_color="white",
        stopwords=stopwords,
        min_font_size=10,
    ).generate(text)
    plt.figure(figsize=(4, 4), facecolor=None)
    plt.imshow(wordcloud)
    plt.axis("off")
    # Set the title before tight_layout so the layout reserves room for it.
    plt.title(label)
    plt.tight_layout(pad=0)
    plt.show()

# Models


In [None]:
# Hold out 20% of the samples for testing. stratify keeps the label
# proportions the same in both parts, so rare labels are not left
# under-represented in the test set by chance.
X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    df["label"],
    test_size=0.2,
    random_state=44,
    stratify=df["label"],
)

In [None]:
# Bag-of-words features: the vocabulary is learned from the training texts
# only and then reused unchanged for the test texts.
cv = CountVectorizer()
X_train_cv = cv.fit_transform(raw_documents=X_train)
X_test_cv = cv.transform(raw_documents=X_test)

## Random Forest


In [None]:
# Fit a 100-tree random forest on the bag-of-words training features.
rf = RandomForestClassifier(random_state=100, n_estimators=100)
rf.fit(X=X_train_cv, y=y_train)

In [None]:
# Predict labels for the held-out test features.
y_pred_rf = rf.predict(X_test_cv)

In [None]:
# Per-class precision / recall / F1 of the random forest on the test set.
report_rf = classification_report(y_true=y_test, y_pred=y_pred_rf)
print("Classification report of Random Forest classifier:\n", report_rf)

## Logistic Regression


In [None]:
# Fit logistic regression on the bag-of-words training features, allowing
# up to 1000 solver iterations.
lr = LogisticRegression(random_state=100, max_iter=1000)
lr.fit(X=X_train_cv, y=y_train)

In [None]:
# Predict labels for the held-out test features.
y_pred_lr = lr.predict(X_test_cv)

In [None]:
# Per-class precision / recall / F1 of logistic regression on the test set.
report_lr = classification_report(y_true=y_test, y_pred=y_pred_lr)
print("Classification report of Logistic Regression (Multi-Class):\n", report_lr)

## Naive Bayes


In [None]:
# Fit multinomial naive Bayes on the word-count training features.
nb = MultinomialNB()
nb.fit(X=X_train_cv, y=y_train)

In [None]:
# Predict labels for the held-out test features.
y_pred_nb = nb.predict(X_test_cv)

In [None]:
# Per-class precision / recall / F1 of naive Bayes on the test set.
report_nb = classification_report(y_true=y_test, y_pred=y_pred_nb)
print("Classification report of Multinomial Naive Bayes:\n", report_nb)

## SVM


In [None]:
# Fit a linear SVM (dual formulation) on the bag-of-words training
# features, allowing up to 2000 iterations.
svm = LinearSVC(dual=True, max_iter=2000, random_state=100)
svm.fit(X=X_train_cv, y=y_train)

In [None]:
# Predict labels for the held-out test features.
y_pred_svm = svm.predict(X_test_cv)

In [None]:
# Per-class precision / recall / F1 of the linear SVM on the test set.
report_svm = classification_report(y_true=y_test, y_pred=y_pred_svm)
print("Classification report of Linear SVM:\n", report_svm)

## LSTM


In [None]:
# Map the string labels to integer ids; the mapping is learned from the
# training labels and applied to both splits.
le = LabelEncoder().fit(y_train)
y_train_lstm = le.transform(y_train)
y_test_lstm = le.transform(y_test)

In [None]:
# Build the word index from the training texts (num_words=5000) and convert
# both splits to integer sequences.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split) for split in (X_train, X_test)
)

In [None]:
# Pad every sequence at the end ("post") up to the length of the longest
# training sequence.
max_len = len(max(X_train_seq, key=len))
pad_kwargs = {"maxlen": max_len, "padding": "post"}
X_train_padded = pad_sequences(X_train_seq, **pad_kwargs)
X_test_padded = pad_sequences(X_test_seq, **pad_kwargs)

In [None]:
# Embedding -> LSTM -> softmax classifier over the encoded labels.
model = Sequential()
model.add(Input(shape=(max_len,)))
# input_dim follows the tokenizer's num_words so the two cannot drift apart.
# mask_zero=True marks the post-padding zeros as padding; without it the LSTM
# keeps updating its state on the pad steps at the end of every short sequence.
model.add(Embedding(input_dim=tokenizer.num_words, output_dim=32, mask_zero=True))
model.add(LSTM(32))
# One output unit per class known to the label encoder that produced the targets.
model.add(Dense(len(le.classes_), activation="softmax"))
# Sparse loss: the targets are integer class ids, not one-hot vectors.
model.compile(
    loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
)
model.summary()

In [None]:
# Stop once validation loss has not improved for 3 epochs.
# restore_best_weights=True rolls the model back to the best epoch; otherwise
# it keeps the weights from the last epoch run, 3 epochs past the best one.
early_stop = EarlyStopping(
    monitor="val_loss", patience=3, restore_best_weights=True
)
# 20% of the training data is held out for validation.
history = model.fit(
    X_train_padded,
    y_train_lstm,
    epochs=25,
    validation_split=0.2,
    callbacks=[early_stop],
)

In [None]:
# Turn the per-class probabilities into predicted class ids.
y_pred = model.predict(X_test_padded)
y_pred = np.argmax(y_pred, axis=1)
accuracy = accuracy_score(y_test_lstm, y_pred)
# Class id i corresponds to le.classes_[i] (sorted order). The EDA `labels`
# array is in first-appearance order, so using it as target_names would
# attach the wrong name to each row of the report.
report = classification_report(
    y_test_lstm,
    y_pred,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_,
    zero_division=0,
)
print("Accuracy:", accuracy)
print("Classification report of LSTM:\n", report)

## CNN


In [None]:
# Encode the string labels as integer ids, then expand the ids to one-hot
# rows for the categorical cross-entropy loss.
le = LabelEncoder()
y_train_cnn = to_categorical(le.fit_transform(y_train))
y_test_cnn = to_categorical(le.transform(y_test))

In [None]:
# Rebuild the word index from the training texts (num_words=5000) and
# convert both splits to integer sequences.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)
X_train_seq, X_test_seq = map(tokenizer.texts_to_sequences, (X_train, X_test))

In [None]:
# Post-pad both splits to the length of the longest training sequence.
max_len = max(map(len, X_train_seq))
X_train_padded, X_test_padded = (
    pad_sequences(seqs, maxlen=max_len, padding="post")
    for seqs in (X_train_seq, X_test_seq)
)

In [None]:
# 1-D convolutional text classifier, declared as a single layer list:
# embedding -> conv block -> global max pool -> dense head -> softmax.
model = Sequential(
    [
        Input(shape=(max_len,)),
        Embedding(input_dim=5000, output_dim=32),
        Conv1D(64, 5, padding="same", activation="relu"),
        BatchNormalization(),
        ReLU(),
        Dropout(0.5),
        GlobalMaxPooling1D(),
        Dense(128, activation="relu"),
        Dropout(0.3),
        Dense(len(labels), activation="softmax"),
    ]
)

# Categorical loss: the targets are one-hot rows.
model.compile(
    loss="categorical_crossentropy",
    optimizer="adamax",
    metrics=["accuracy", Precision(), Recall()],
)
model.summary()

In [None]:
# Train with the same early-stopping policy as the LSTM: stop after 3 epochs
# without validation-loss improvement and roll back to the best weights,
# instead of always running all 25 epochs. A fresh callback instance is
# created so no state is shared with another training run.
early_stop_cnn = EarlyStopping(
    monitor="val_loss", patience=3, restore_best_weights=True
)
history = model.fit(
    X_train_padded,
    y_train_cnn,
    epochs=25,
    validation_split=0.2,
    callbacks=[early_stop_cnn],
)

In [None]:
# Collapse predicted probabilities and one-hot targets back to class ids.
y_pred = model.predict(X_test_padded)
y_pred_labels = np.argmax(y_pred, axis=1)
y_test_labels = np.argmax(y_test_cnn, axis=1)
accuracy = accuracy_score(y_test_labels, y_pred_labels)
# Class id i corresponds to le.classes_[i] (sorted order). The EDA `labels`
# array is in first-appearance order, so using it as target_names would
# attach the wrong name to each row of the report.
report = classification_report(
    y_test_labels,
    y_pred_labels,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_,
    zero_division=0,
)

print("Accuracy:", accuracy)
print("Classification report of CNN:\n", report)