In [128]:
# Logistic Regression Model
#
# Fits a TF-IDF + logistic-regression text classifier on the labeled comments
# and ends with the (train accuracy, test accuracy) tuple as the cell output.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the file containing labeled training data
file_path_with_labels = "HAO_comment_classification.json"
data_with_labels = pd.read_json(file_path_with_labels)

# Show the first few rows of the dataset to understand its structure.
# A bare expression in the middle of a cell is evaluated and discarded (only
# the cell's final expression is echoed), so the preview is printed explicitly.
print(data_with_labels.head())

# Get the features (raw comment text) and labels
X = data_with_labels["Comment Body"]
y = data_with_labels["Comment Classification"]

# Split the data into training and test sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=358)

# Report the size of the training and test sets
print("train/test sizes:", X_train.shape, X_test.shape)

# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Learn the vocabulary/IDF weights on the training set only, then transform it
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the test set with the already-fitted vectorizer (no refit, so no
# information from the test comments reaches the features)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Report the shape of the transformed feature matrices
print("train/test TF-IDF shapes:", X_train_tfidf.shape, X_test_tfidf.shape)

# Initialize the Logistic Regression model
logreg_model = LogisticRegression(max_iter=10000, random_state=358)

# Train the model
logreg_model.fit(X_train_tfidf, y_train)

# Make predictions on the training and test sets
y_train_pred = logreg_model.predict(X_train_tfidf)
y_test_pred = logreg_model.predict(X_test_tfidf)

# Calculate accuracy on the training and test sets
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Final expression of the cell: echoed as (train accuracy, test accuracy)
train_accuracy, test_accuracy


(0.627939142461964, 0.35911602209944754)

In [129]:
# Random Forest Model
#
# Fits a TF-IDF + random-forest text classifier on the labeled comments and
# ends with the (train accuracy, test accuracy) tuple as the cell output.
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Load the file containing labeled training data
file_path_with_labels = "HAO_comment_classification.json"
data_with_labels = pd.read_json(file_path_with_labels)

# Show the first few rows of the dataset to understand its structure.
# A bare expression in the middle of a cell is evaluated and discarded (only
# the cell's final expression is echoed), so the preview is printed explicitly.
print(data_with_labels.head())

# Get the features (raw comment text) and labels
X = data_with_labels["Comment Body"]
y = data_with_labels["Comment Classification"]

# Split the data into training and test sets (80-20 split)
# NOTE(review): this split uses random_state=188, whereas the logistic
# regression cell splits with random_state=358, so the two models are scored
# on different held-out rows -- confirm this is intended before comparing
# their test accuracies.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=188)

# Report the size of the training and test sets
print("train/test sizes:", X_train.shape, X_test.shape)

# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Learn the vocabulary/IDF weights on the training set only, then transform it
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the test set with the already-fitted vectorizer (no refit)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Report the shape of the transformed feature matrices
print("train/test TF-IDF shapes:", X_train_tfidf.shape, X_test_tfidf.shape)

# Initialize the Random Forest model
random_forest_model = RandomForestClassifier(random_state=188)

# Train the model
random_forest_model.fit(X_train_tfidf, y_train)

# Make predictions on the training and test sets
y_train_rf_pred = random_forest_model.predict(X_train_tfidf)
y_test_rf_pred = random_forest_model.predict(X_test_tfidf)

# Calculate accuracy on the training and test sets
train_rf_accuracy = accuracy_score(y_train, y_train_rf_pred)
test_rf_accuracy = accuracy_score(y_test, y_test_rf_pred)

# Final expression of the cell: echoed as (train accuracy, test accuracy)
train_rf_accuracy, test_rf_accuracy


(0.9930843706777317, 0.4419889502762431)

In [130]:
# Support Vector Machine (SVM) Model
#
# Fits a TF-IDF + SVC text classifier on the labeled comments and ends with
# the (train accuracy, test accuracy) tuple as the cell output.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Load the file containing labeled training data
file_path_with_labels = "HAO_comment_classification.json"
data_with_labels = pd.read_json(file_path_with_labels)

# Show the first few rows of the dataset to understand its structure.
# A bare expression in the middle of a cell is evaluated and discarded (only
# the cell's final expression is echoed), so the preview is printed explicitly.
print(data_with_labels.head())

# Get the features (raw comment text) and labels
X = data_with_labels["Comment Body"]
y = data_with_labels["Comment Classification"]

# Split the data into training and test sets (80-20 split)
# NOTE(review): this split uses random_state=196, whereas the logistic
# regression and random forest cells split with random_state=358 and 188
# respectively, so the three models are scored on different held-out rows --
# confirm this is intended before comparing their test accuracies.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=196)

# Report the size of the training and test sets
print("train/test sizes:", X_train.shape, X_test.shape)

# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Learn the vocabulary/IDF weights on the training set only, then transform it
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the test set with the already-fitted vectorizer (no refit)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Report the shape of the transformed feature matrices
print("train/test TF-IDF shapes:", X_train_tfidf.shape, X_test_tfidf.shape)

# Initialize the Support Vector Machine model
svm_model = SVC(random_state=196)

# Train the model
svm_model.fit(X_train_tfidf, y_train)

# Make predictions on the training and test sets
y_train_svm_pred = svm_model.predict(X_train_tfidf)
y_test_svm_pred = svm_model.predict(X_test_tfidf)

# Calculate accuracy on the training and test sets
train_svm_accuracy = accuracy_score(y_train, y_train_svm_pred)
test_svm_accuracy = accuracy_score(y_test, y_test_svm_pred)

# Final expression of the cell: echoed as (train accuracy, test accuracy)
train_svm_accuracy, test_svm_accuracy


(0.8879668049792531, 0.34806629834254144)