In [25]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Reshape, Conv1D, MaxPooling1D, LSTM, GlobalMaxPooling1D, BatchNormalization, Dropout
from tensorflow.keras.optimizers import Adam
from keras.regularizers import l2
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import StandardScaler
import nltk
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [11]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')
# Set Working Directory
%cd /content/drive/MyDrive/CSE6242
#Load Data
data = pd.read_csv("combined_reviews.csv")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Project11_FakeNewsDetection


In [13]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [14]:
# Preprocess Data (Tokenization, etc.)
def preprocess_text(text):
    return word_tokenize(text.lower())

In [15]:
data['processed_reviewContent'] = data['reviewContent'].apply(preprocess_text)

In [16]:
# Create Doc2Vec Vectors
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(data['processed_reviewContent'])]
doc2vec_model = Doc2Vec(documents, vector_size=100, window=2, min_count=1, workers=4)
doc2vec_vectors = np.array([doc2vec_model.infer_vector(doc.words) for doc in documents])

In [36]:
# Save Doc2Vec model
doc2vec_model.save('doc2vec_model_path')

In [17]:
# TF-IDF Feature Extraction
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_vectors = tfidf_vectorizer.fit_transform([' '.join(doc) for doc in data['processed_reviewContent']]).toarray()

In [33]:
import pickle
# Save TF-IDF Vectorizer
with open('tfidf_vectorizer.pkl', 'wb') as file:
    pickle.dump(tfidf_vectorizer, file)

In [18]:
# Combine Doc2Vec and TF-IDF Features
combined_features = np.hstack((doc2vec_vectors, tfidf_vectors))

In [19]:
# Scale Features
scaler = StandardScaler()
X = scaler.fit_transform(combined_features)
y = data['label'].values

In [34]:
# Save StandardScaler
with open('scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

In [20]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
# Define Model Architecture
def create_model(input_dim):
    model = Sequential()
    model.add(Dense(128, input_dim=input_dim, activation='relu'))
    model.add(Reshape((128, 1)))  # Reshape for Conv1D
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(LSTM(100, return_sequences=True))
    model.add(GlobalMaxPooling1D())
    model.add(BatchNormalization())
    model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.01)))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])
    return model

In [23]:
# Initialize the Model
model = create_model(input_dim=X_train.shape[1])

In [26]:
# Callbacks
checkpoint = ModelCheckpoint('best_model.h5', monitor='val_accuracy', save_best_only=True, mode='max')
early_stopping = EarlyStopping(monitor='val_accuracy', patience=5, mode='max')


In [27]:
# Train the Model
history = model.fit(
    X_train, y_train,
    epochs=10,
    batch_size=32,
    verbose=1,
    validation_split=0.2,
    callbacks=[checkpoint, early_stopping]
)

Epoch 1/10
Epoch 2/10
  16/1137 [..............................] - ETA: 7s - loss: 0.3152 - accuracy: 0.8691

  saving_api.save_model(


Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


In [28]:
# Evaluate the Model
scores = model.evaluate(X_test, y_test, verbose=0)
print(f'Test accuracy: {scores[1]*100:.2f}%')

Test accuracy: 82.22%


In [29]:
# Classification Report
y_pred = model.predict(X_test).round()
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.82      0.82      0.82      5599
           1       0.83      0.82      0.82      5765

    accuracy                           0.82     11364
   macro avg       0.82      0.82      0.82     11364
weighted avg       0.82      0.82      0.82     11364



In [30]:
# Save model
model.save('text_classification_model.h5')

  saving_api.save_model(
