In [25]:
# Logistic Regression Model
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the labeled comment data (path is relative to the notebook's working directory)
file_path_with_labels = "HAO_comment_classification.json"
data_with_labels = pd.read_json(file_path_with_labels)

# Preview the first rows. A notebook only renders the LAST expression of a
# cell, so intermediate results must be passed to display() explicitly
# (display is provided as a built-in by the IPython/Jupyter kernel).
display(data_with_labels.head())

# Text column fed to TF-IDF, and the label column to predict
X = data_with_labels["Comment Body"]
y = data_with_labels["Comment Classification"]

# Split the data into training and test sets (80-20 split, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Show the size of the training and test sets
display((X_train.shape, X_test.shape))

# Fit the TF-IDF vocabulary on the training texts only, then reuse it for the
# test texts so no information from the test set leaks into the features
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Show the shape of the transformed feature matrices
display((X_train_tfidf.shape, X_test_tfidf.shape))

# Initialize the Logistic Regression model (generous iteration cap, fixed seed)
logreg_model = LogisticRegression(max_iter=10000, random_state=42)

# Train the model
logreg_model.fit(X_train_tfidf, y_train)

# Make predictions on the training and test sets
y_train_pred = logreg_model.predict(X_train_tfidf)
y_test_pred = logreg_model.predict(X_test_tfidf)

# Calculate accuracy on the training and test sets
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Cell output: (train accuracy, test accuracy)
train_accuracy, test_accuracy


(0.6417704011065007, 0.3370165745856354)

In [23]:
# Random Forest Model
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

# Imports that were previously scattered through the cell body are kept here,
# directly under the cell's import block, so a fresh-kernel run resolves every
# name before it is used.
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Load the labeled comment data (path is relative to the notebook's working directory)
file_path_with_labels = "HAO_comment_classification.json"
data_with_labels = pd.read_json(file_path_with_labels)

# Preview the first rows. A notebook only renders the LAST expression of a
# cell, so intermediate results must be passed to display() explicitly
# (display is provided as a built-in by the IPython/Jupyter kernel).
display(data_with_labels.head())

# Text column fed to TF-IDF, and the label column to predict
X = data_with_labels["Comment Body"]
y = data_with_labels["Comment Classification"]

# Split the data into training and test sets (80-20 split, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Show the size of the training and test sets
display((X_train.shape, X_test.shape))

# Fit the TF-IDF vocabulary on the training texts only, then reuse it for the
# test texts so no information from the test set leaks into the features
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Show the shape of the transformed feature matrices
display((X_train_tfidf.shape, X_test_tfidf.shape))

# Initialize the Random Forest model (library defaults, fixed seed)
random_forest_model = RandomForestClassifier(random_state=42)

# Train the model
random_forest_model.fit(X_train_tfidf, y_train)

# Make predictions on the training and test sets
y_train_rf_pred = random_forest_model.predict(X_train_tfidf)
y_test_rf_pred = random_forest_model.predict(X_test_tfidf)

# Calculate accuracy on the training and test sets
train_rf_accuracy = accuracy_score(y_train, y_train_rf_pred)
test_rf_accuracy = accuracy_score(y_test, y_test_rf_pred)

# Cell output: (train accuracy, test accuracy)
train_rf_accuracy, test_rf_accuracy


(0.9930843706777317, 0.35359116022099446)

In [22]:
# Support Vector Machine (SVM) Model
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

# Imports that were previously scattered through the cell body are kept here,
# directly under the cell's import block, so a fresh-kernel run resolves every
# name before it is used.
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Load the labeled comment data (path is relative to the notebook's working directory)
file_path_with_labels = "HAO_comment_classification.json"
data_with_labels = pd.read_json(file_path_with_labels)

# Preview the first rows. A notebook only renders the LAST expression of a
# cell, so intermediate results must be passed to display() explicitly
# (display is provided as a built-in by the IPython/Jupyter kernel).
display(data_with_labels.head())

# Text column fed to TF-IDF, and the label column to predict
X = data_with_labels["Comment Body"]
y = data_with_labels["Comment Classification"]

# Split the data into training and test sets (80-20 split, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Show the size of the training and test sets
display((X_train.shape, X_test.shape))

# Fit the TF-IDF vocabulary on the training texts only, then reuse it for the
# test texts so no information from the test set leaks into the features
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Show the shape of the transformed feature matrices
display((X_train_tfidf.shape, X_test_tfidf.shape))

# Initialize the Support Vector Machine model (library defaults, fixed seed)
svm_model = SVC(random_state=42)

# Train the model
svm_model.fit(X_train_tfidf, y_train)

# Make predictions on the training and test sets
y_train_svm_pred = svm_model.predict(X_train_tfidf)
y_test_svm_pred = svm_model.predict(X_test_tfidf)

# Calculate accuracy on the training and test sets
train_svm_accuracy = accuracy_score(y_train, y_train_svm_pred)
test_svm_accuracy = accuracy_score(y_test, y_test_svm_pred)

# Cell output: (train accuracy, test accuracy)
train_svm_accuracy, test_svm_accuracy


(0.8796680497925311, 0.30386740331491713)

In [21]:
# Neural Network Model
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.regularizers import l2
from keras.optimizers import Adam
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pandas as pd

# Reading the data (path is relative to the notebook's working directory)
file_path_with_labels = "HAO_comment_classification.json"
data_with_labels = pd.read_json(file_path_with_labels)

# Filtering out classes with only one sample: the stratified splits below
# cannot place a single-member class on both sides of a split
class_counts = data_with_labels["Comment Classification"].value_counts()
single_sample_classes = class_counts[class_counts == 1].index
data_with_labels_filtered = data_with_labels[~data_with_labels["Comment Classification"].isin(single_sample_classes)]

# Getting the features and labels
X = data_with_labels_filtered["Comment Body"]
y = data_with_labels_filtered["Comment Classification"]

# Encoding the string labels as integers 0..K-1
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Total number of classes; shared by the one-hot targets and the output layer
# so that their widths always agree
num_classes = len(label_encoder.classes_)

# Splitting the data: 80% temp / 20% test, then temp into 80% train / 20% validation
# NOTE(review): X_test / y_test_encoded are held out here but are neither
# vectorized nor evaluated in this cell -- confirm whether a final test
# evaluation is intended elsewhere.
X_temp, X_test, y_temp_encoded, y_test_encoded = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)
X_train, X_val, y_train_encoded, y_val_encoded = train_test_split(X_temp, y_temp_encoded, test_size=0.2, random_state=42, stratify=y_temp_encoded)

# Vectorizing the features using TF-IDF (vocabulary fitted on the training
# texts only); densified because the Dense layers are fed plain arrays
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train).toarray()
X_val_tfidf = tfidf_vectorizer.transform(X_val).toarray()

# Converting integer labels to one-hot encoding.
# num_classes is passed explicitly: without it the width is inferred as
# max(label) + 1 per call, so a split that happens to lack the highest-indexed
# class would yield a narrower matrix than the softmax layer expects.
y_train_one_hot = to_categorical(y_train_encoded, num_classes=num_classes)
y_val_one_hot = to_categorical(y_val_encoded, num_classes=num_classes)

# Defining the custom learning rate
learning_rate = 0.001

# Defining the optimizer with the custom learning rate
optimizer_with_custom_lr = Adam(learning_rate=learning_rate)

# Defining the neural network model: one hidden ReLU layer with light L2
# regularization, dropout, and a softmax output with one unit per class
# NOTE(review): no Keras/NumPy seed is set in this cell, so the reported
# accuracies may differ between runs -- confirm whether one is set elsewhere.
dropout_rate = 0.3
model_with_custom_lr = Sequential([
    Dense(256, input_shape=(X_train_tfidf.shape[1],), activation='relu', kernel_regularizer=l2(0.00001)),
    Dropout(dropout_rate),
    Dense(num_classes, activation='softmax')
])

# Compiling the model with the custom optimizer
model_with_custom_lr.compile(optimizer=optimizer_with_custom_lr, loss='categorical_crossentropy', metrics=['accuracy'])

# Training the model, monitoring the validation split after every epoch
history_with_custom_lr = model_with_custom_lr.fit(X_train_tfidf, y_train_one_hot, epochs=20, batch_size=32, validation_data=(X_val_tfidf, y_val_one_hot), verbose=1)

# Evaluating the model; evaluate() returns [loss, accuracy], so index 1 is the accuracy
train_accuracy_with_custom_lr = model_with_custom_lr.evaluate(X_train_tfidf, y_train_one_hot, verbose=0)[1]
val_accuracy_with_custom_lr = model_with_custom_lr.evaluate(X_val_tfidf, y_val_one_hot, verbose=0)[1]

# Cell output: (train accuracy, validation accuracy)
train_accuracy_with_custom_lr, val_accuracy_with_custom_lr


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


(0.9895104765892029, 0.3986014127731323)