In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
from keras.models import Sequential
from keras.layers import Dense, Input, Dropout
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
data = pd.read_csv("train.csv")
data

Unnamed: 0,Id,Comment,Topic
0,0x840,A few things. You might have negative- frequen...,Biology
1,0xbf0,Is it so hard to believe that there exist part...,Physics
2,0x1dfc,There are bees,Biology
3,0xc7e,I'm a medication technician. And that's alot o...,Biology
4,0xbba,Cesium is such a pretty metal.,Chemistry
...,...,...,...
8690,0x1e02,I make similar observations over the last week...,Biology
8691,0xc8d,You would know.,Biology
8692,0x723,Also use the correct number of sig figs,Chemistry
8693,0x667,"What about the ethical delimmas, groundbreaki...",Biology


In [3]:
def drop_data(data:pd.DataFrame, columns:list):
  data_copy = data.copy()
  data_copy.drop(columns, axis=1, inplace=True)
  return data_copy

In [4]:
data = drop_data(data,['Id'])

In [5]:
def label_data(data:pd.DataFrame, columns:list):
  encoder = LabelEncoder()
  data_copy = data.copy()

  for column in columns:
    data_copy[column] = encoder.fit_transform(data_copy[[column]].astype(str).values.ravel())
  return data_copy

In [6]:
data = label_data(data,['Topic'])

In [7]:
data.head()

Unnamed: 0,Comment,Topic
0,A few things. You might have negative- frequen...,0
1,Is it so hard to believe that there exist part...,2
2,There are bees,0
3,I'm a medication technician. And that's alot o...,0
4,Cesium is such a pretty metal.,1


In [8]:
X = list(data['Comment'])
y = data['Topic']

In [9]:
tfidf_vectorizer = TfidfVectorizer(
    binary=True,
    norm=None,
    use_idf=False,
    smooth_idf=False,
    lowercase=True,
    stop_words='english',
    token_pattern=r'(?u)\b[A-Za-z]+\b',
    min_df=1,
    max_df=1.0,
    max_features=None,
    ngram_range=(1, 1)
)
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(X)
X = tfidf_vectorizer_vectors.toarray()

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [11]:
model2 = Sequential([
    Input(shape=(X.shape[1],)),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(4, activation='softmax')
])

In [12]:
model2.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"]
)

In [13]:
history = model2.fit(X_train, y_train, validation_split=0.1, epochs=10, batch_size=8)

Epoch 1/10
[1m685/685[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 38ms/step - accuracy: 0.5144 - loss: 1.0220 - val_accuracy: 0.6765 - val_loss: 0.6813
Epoch 2/10
[1m685/685[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 39ms/step - accuracy: 0.8598 - loss: 0.3754 - val_accuracy: 0.6995 - val_loss: 0.7540
Epoch 3/10
[1m685/685[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 35ms/step - accuracy: 0.9293 - loss: 0.1729 - val_accuracy: 0.6962 - val_loss: 1.0034
Epoch 4/10
[1m685/685[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 34ms/step - accuracy: 0.9567 - loss: 0.1052 - val_accuracy: 0.6929 - val_loss: 1.2200
Epoch 5/10
[1m685/685[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 35ms/step - accuracy: 0.9724 - loss: 0.0668 - val_accuracy: 0.6946 - val_loss: 1.3040
Epoch 6/10
[1m685/685[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 34ms/step - accuracy: 0.9728 - loss: 0.0630 - val_accuracy: 0.7011 - val_loss: 1.6088
Epoch 7/10
[1m6

In [23]:
y_pred = [probs.tolist().index(max(probs)) for probs in model2.predict(X_test)]
print(classification_report(y_test, y_pred))
print(f1_score(y_test, y_pred, average='micro'))

[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
              precision    recall  f1-score   support

           0       0.72      0.73      0.72      1088
           1       0.64      0.66      0.65       889
           2       0.67      0.63      0.65       632

    accuracy                           0.68      2609
   macro avg       0.68      0.67      0.67      2609
weighted avg       0.68      0.68      0.68      2609

0.6803372939823688
