In [1]:
import numpy as np
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import load_model
import pickle
import os
import nltk

In [2]:
# Mount Google Drive so the project folder is reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
# Set the working directory; the relative file paths used in later cells
# are resolved against this directory.
%cd /content/drive/MyDrive/Project11_FakeNewsDetection
# Fetch the 'punkt' tokenizer data that nltk's word_tokenize relies on.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead — confirm
# against the NLTK version installed in the runtime.
nltk.download('punkt')


Mounted at /content/drive
/content/drive/MyDrive/Project11_FakeNewsDetection


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
# Load the trained Doc2Vec, TF-IDF, StandardScaler, and model.
# All four paths are relative, so they resolve against the current working
# directory (presumably the Drive project folder — verify before running).
doc2vec_model = Doc2Vec.load('doc2vec_model_path')
# SECURITY: pickle.load executes arbitrary code embedded in the file; only
# load artifacts that were produced by a trusted training run.
with open('tfidf_vectorizer.pkl', 'rb') as file:
    tfidf_vectorizer = pickle.load(file)
with open('scaler.pkl', 'rb') as file:
    scaler = pickle.load(file)
# Keras classifier stored in HDF5 format.
model = load_model('text_classification_model.h5')

In [4]:
# Text preprocessing helper
def preprocess_text(text):
    """Lowercase ``text`` and split it into tokens with ``word_tokenize``.

    Anything that is not a ``str`` (e.g. ``None`` or NaN) produces an empty
    token list instead of raising.
    """
    if not isinstance(text, str):
        return []
    lowered = text.lower()
    return word_tokenize(lowered)

In [5]:
def predict_label(text):
    """Predict the label for a single piece of text.

    Parameters
    ----------
    text : object
        The raw text. Any non-string value (None, NaN, numbers, lists,
        arrays, ...) is treated as "not text".

    Returns
    -------
    int
        0 when ``text`` is not a string (marked as real); otherwise the
        classifier output rounded to the nearest integer.
        NOTE(review): presumably 1 means fake — confirm against the
        training notebook.
    """
    # Check the type first. The previous `pd.isna(text) or ...` form raised
    # "truth value of an array is ambiguous" for list/array input because
    # pd.isna returns an element-wise array for those. A str is never NA, so
    # the isinstance test alone covers None/NaN as well.
    if not isinstance(text, str):
        return 0  # Mark as real if it's not text

    processed_text = preprocess_text(text)

    # Feature vector layout: Doc2Vec embedding followed by the TF-IDF row.
    # Assumes this is the same layout the scaler and model were fitted on —
    # TODO confirm against the training code.
    doc_vec = doc2vec_model.infer_vector(processed_text)
    tfidf_vec = tfidf_vectorizer.transform([' '.join(processed_text)]).toarray()[0]
    combined_features = np.hstack((doc_vec, tfidf_vec))
    scaled_features = scaler.transform([combined_features])

    prediction = model.predict(scaled_features, verbose=0).round().astype(int).flatten()[0]
    # Return a plain Python int so both code paths yield the same type.
    return int(prediction)


In [6]:
# Load the reviews to be scored; later cells read the 'text' column of this frame.
df = pd.read_csv("updated_filtered_atlanta_restaurant_reviews.csv")

In [7]:
from tqdm import tqdm

# Split the dataframe into roughly `chunk_size`-row chunks.
# np.array_split is kept (rather than fixed-size slicing) so chunk boundaries
# stay identical to those used for chunk files already written by earlier runs.
# max(1, ...) avoids a ValueError when the frame has fewer than `chunk_size` rows.
chunk_size = 1000
chunks = np.array_split(df, max(1, len(df) // chunk_size))

# Directory holding one CSV per processed chunk plus the merged result.
data_directory = "data_new_lstm_2"
os.makedirs(data_directory, exist_ok=True)

# Resume point: chunks 1..start_chunk_index are expected to exist on disk from
# earlier runs; processing continues with chunk start_chunk_index + 1.
# Set to 0 to process everything from scratch.
start_chunk_index = 740
processed_chunks = []

for i, chunk in enumerate(tqdm(chunks[start_chunk_index:], desc="Processing Chunks"), start=start_chunk_index + 1):
    filename = os.path.join(data_directory, f"{i}.csv")

    # Check for a saved result BEFORE running the (slow) per-row prediction,
    # so that "skipping" actually avoids the work.
    if os.path.exists(filename):
        print(f"Chunk {i} already processed. Skipping.")
        processed_chunk = pd.read_csv(filename)
    else:
        processed_chunk = chunk.copy()
        processed_chunk['predicted_label'] = processed_chunk['text'].apply(predict_label)

        # Write to a temporary file and rename it into place so an interrupted
        # run cannot leave a truncated CSV that a later run mistakes for done.
        temp_filename = filename + ".tmp"
        processed_chunk.to_csv(temp_filename, index=False)
        os.replace(temp_filename, filename)
        print(f"Chunk {i} saved as {filename}")

    processed_chunks.append(processed_chunk)

# Combine the chunks saved by earlier runs with the ones handled above.
# Building a single list first avoids calling pd.concat on an empty list,
# which raised when start_chunk_index was 0.
previous_chunks = [
    pd.read_csv(os.path.join(data_directory, f"{j}.csv"))
    for j in range(1, start_chunk_index + 1)
]
final_df = pd.concat(previous_chunks + processed_chunks, ignore_index=True)

# Save the final complete file
final_filename = os.path.join(data_directory, "complete_file.csv")
final_df.to_csv(final_filename, index=False)

print(f"Final complete file saved as {final_filename}")

Processing Chunks:   5%|▌         | 1/20 [00:43<13:55, 43.95s/it]

Chunk 741 saved as data_new_lstm_2/741.csv


Processing Chunks:  10%|█         | 2/20 [01:20<11:49, 39.43s/it]

Chunk 742 saved as data_new_lstm_2/742.csv


Processing Chunks:  15%|█▌        | 3/20 [01:37<08:19, 29.38s/it]

Chunk 743 saved as data_new_lstm_2/743.csv


Processing Chunks:  20%|██        | 4/20 [02:31<10:27, 39.23s/it]

Chunk 744 saved as data_new_lstm_2/744.csv


Processing Chunks:  25%|██▌       | 5/20 [02:55<08:22, 33.47s/it]

Chunk 745 saved as data_new_lstm_2/745.csv


Processing Chunks:  30%|███       | 6/20 [03:23<07:23, 31.65s/it]

Chunk 746 saved as data_new_lstm_2/746.csv


Processing Chunks:  35%|███▌      | 7/20 [04:16<08:21, 38.57s/it]

Chunk 747 saved as data_new_lstm_2/747.csv


Processing Chunks:  40%|████      | 8/20 [04:59<07:59, 39.99s/it]

Chunk 748 saved as data_new_lstm_2/748.csv


Processing Chunks:  45%|████▌     | 9/20 [05:08<05:34, 30.41s/it]

Chunk 749 saved as data_new_lstm_2/749.csv


Processing Chunks:  50%|█████     | 10/20 [06:02<06:15, 37.53s/it]

Chunk 750 saved as data_new_lstm_2/750.csv


Processing Chunks:  55%|█████▌    | 11/20 [06:25<04:59, 33.27s/it]

Chunk 751 saved as data_new_lstm_2/751.csv


Processing Chunks:  60%|██████    | 12/20 [07:03<04:36, 34.59s/it]

Chunk 752 saved as data_new_lstm_2/752.csv


Processing Chunks:  65%|██████▌   | 13/20 [07:39<04:06, 35.17s/it]

Chunk 753 saved as data_new_lstm_2/753.csv


Processing Chunks:  70%|███████   | 14/20 [08:09<03:21, 33.56s/it]

Chunk 754 saved as data_new_lstm_2/754.csv


Processing Chunks:  75%|███████▌  | 15/20 [08:35<02:35, 31.11s/it]

Chunk 755 saved as data_new_lstm_2/755.csv


Processing Chunks:  80%|████████  | 16/20 [09:06<02:05, 31.33s/it]

Chunk 756 saved as data_new_lstm_2/756.csv


Processing Chunks:  85%|████████▌ | 17/20 [09:25<01:22, 27.58s/it]

Chunk 757 saved as data_new_lstm_2/757.csv


Processing Chunks:  90%|█████████ | 18/20 [09:57<00:57, 28.89s/it]

Chunk 758 saved as data_new_lstm_2/758.csv


Processing Chunks:  95%|█████████▌| 19/20 [10:18<00:26, 26.55s/it]

Chunk 759 saved as data_new_lstm_2/759.csv


Processing Chunks: 100%|██████████| 20/20 [10:41<00:00, 32.06s/it]

Chunk 760 saved as data_new_lstm_2/760.csv





Final complete file saved as data_new_lstm_2/complete_file.csv
