In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from scipy.sparse import csr_matrix
from bs4 import BeautifulSoup
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv1D, MaxPooling1D
import re
import string

In [2]:
# Read in data from CSV files
train = pd.read_csv("Dataset/train.csv")
test = pd.read_csv("Dataset/test.csv")

In [3]:
# Drop subject/date and concatenating text and title
train["text"] = train["title"] + " " + train["text"]
test["text"] = test["title"] + " " + test["text"]

train = train.drop(["subject", "date", "title"], axis = 1)
test = test.drop(["subject", "date", "title"], axis = 1)

In [4]:
def clean_text_data(data_point):
    review_soup = BeautifulSoup(data_point)
    review_text = review_soup.get_text()
    review_letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    review_lower_case = review_letters_only.lower()  
    review_words = review_lower_case.split() 
    stop_words = stopwords.words("english")
    meaningful_words = [x for x in review_words if x not in stop_words]
        
    return(" ".join(meaningful_words)) 

train["text"] = train["text"].apply(clean_text_data)
test["text"] = test["text"].apply(clean_text_data)
data = pd.concat([train, test], ignore_index=True)




In [5]:
#print(train["text"].iloc[1])

In [6]:
# Split the data into training and validation sets
X_train, X_val, Y_train, Y_val = train_test_split(train["text"], train["label"], test_size=0.2, random_state=42)

# Convert train/test data to vectors
vectorizer = CountVectorizer(analyzer = "word", tokenizer = None, preprocessor = None, stop_words = None, max_features = 5000) 

# Convert the text into bag-of-words features
vectorizer = CountVectorizer()
vectorizer.fit(data["text"])
X_train = vectorizer.transform(X_train)
X_val = vectorizer.transform(X_val)

# Convert the sparse matrices to dense numpy arrays and add an extra dimension
X_train_array = np.expand_dims(X_train.toarray(), axis=-1)
X_val_array = np.expand_dims(X_val.toarray(), axis=-1)

# Convert the labels to numpy arrays
Y_train = Y_train.to_numpy()
Y_val = Y_val.to_numpy()

In [7]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
Y_train = le.fit_transform(Y_train)
Y_val = le.transform(Y_val)

In [10]:
# Define the input shape
input_shape = X_train_array.shape[1:]

# Define the number of classes
num_classes = len(np.unique(Y_train))

# Build the CNN model
model = Sequential()
model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=input_shape))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=32, kernel_size=3, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', tf.keras.metrics.Precision(name='precision'), 
                       tf.keras.metrics.Recall(name='recall')])

# Train the model
model.fit(X_train_array, Y_train, validation_data=(X_val_array, Y_val), epochs=3, batch_size=64)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x2778011e460>

In [11]:
X_test = test["text"]
X_test = vectorizer.transform(X_test)
y_test = test['label']
y_test = le.transform(y_test)
# Reshape the array
X_test = np.expand_dims(X_test.toarray(), axis=-1)
# Evaluate the CNN model on the testing data
test_loss, test_acc, test_precision, test_recall = model.evaluate(X_test, y_test)
print('Test accuracy:', test_acc)

Test accuracy: 0.9976000189781189


In [13]:
f1_score = 2 * (test_precision * test_recall) / (test_precision + test_recall)
print(f1_score)

0.9975951780962218
