## Mount Google Drive

In [None]:
# Mount Google Drive at /content/drive; the CSV paths used by the cells below
# all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Import necessary libraries

In [None]:
!pip install tabulate
from tabulate import tabulate
import requests
import pandas as pd
import time
import json
import re



In [None]:
# Location of the merged source CSV on the mounted Drive
file_path = "/content/drive/MyDrive/SW_PROJECT/openai_prompting_merged.csv"
# Load it into `df`, the working DataFrame that the following cells modify
df = pd.read_csv(file_path)

## Merging post title with post body

In [None]:
# Destination on Drive for the dataset with title and body merged
combined_text_file_path = "/content/drive/MyDrive/SW_PROJECT/openai_prompting_combined.csv"

# Build 'Post' as "<Title> <Post Body>", treating missing values as empty strings
title_text = df["Title"].fillna("")
body_text = df["Post Body"].fillna("")
df["Post"] = title_text + " " + body_text

# Keep only posts with some non-whitespace content and renumber the rows from 0
has_content = df["Post"].str.strip().ne("")
df = df.loc[has_content].reset_index(drop=True)

# Export the metadata columns together with the merged text
columns_to_keep = [
    "Post ID",
    "Title",
    "Tags",
    "Created At",
    "Last Activity",
    "Views",
    "Replies",
    "Participants",
    "Post",
]
df_final = df.loc[:, columns_to_keep]

# Write to disk, then read the file back to confirm what was actually saved
df_final.to_csv(combined_text_file_path, index=False)
df_processed = pd.read_csv(combined_text_file_path)

print(
    f"Dataset now contains {len(df_final)} posts after merging and filtering!",
    f"Combined dataset saved at: {combined_text_file_path}",
    sep="\n",
)
print(tabulate(df_processed.head(5), headers='keys', tablefmt='fancy_grid'))

Dataset now contains 2693 posts after merging and filtering!
Combined dataset saved at: /content/drive/MyDrive/SW_PROJECT/openai_prompting_combined.csv
╒════╤═══════════╤═════════════════════════════════════════════════════════════════╤═════════════════════════════════════╤══════════════════════════════════╤══════════════════════════════════╤═════════╤═══════════╤════════════════╤═════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════

## Remove HTML tags, URLs, punctuation, special characters, and numbers

In [None]:
# Destination on Drive for the dataset whose post text has been cleaned
cleaned_text_file_path = "/content/drive/MyDrive/SW_PROJECT/openai_prompting_cleaned.csv"

# Define text cleaning function
def clean_text(text):
    """Normalize raw post text into lowercase, space-separated words.

    Steps, in order: lowercase, strip HTML tags, strip URLs, strip
    punctuation/special characters, strip digits, collapse whitespace.

    Args:
        text: Raw post text. Any non-string value (e.g. NaN or None coming
            from a missing cell) is treated as empty.

    Returns:
        The cleaned string, or "" when nothing is left.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()
    # Tags go first so a URL inside an attribute (href="http://...") is dropped
    # with its tag instead of the URL pattern swallowing the link text after it.
    text = re.sub(r"<.*?>", "", text)
    # Single backslash: in a raw string r"\\S" would match a literal backslash
    # followed by "S", which never occurs in a URL, so nothing would be removed.
    text = re.sub(r"http\S+", "", text)
    # \w keeps letters, digits and underscore; everything else that is not
    # whitespace is dropped.
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\d+", "", text)
    # split()/join collapses runs of whitespace and trims both ends.
    return " ".join(text.split())

# Add the normalized text next to the raw 'Post' column
df["Cleaned Post"] = df["Post"].map(clean_text)

# Export the same metadata columns as before, with the cleaned text replacing 'Post'
columns_to_keep = [
    "Post ID",
    "Title",
    "Tags",
    "Created At",
    "Last Activity",
    "Views",
    "Replies",
    "Participants",
    "Cleaned Post",
]
df_final = df.loc[:, columns_to_keep]

# Write to disk, then read the file back to confirm what was actually saved
df_final.to_csv(cleaned_text_file_path, index=False)
df_cleaned = pd.read_csv(cleaned_text_file_path)

print(
    f"Dataset now contains {len(df_final)} posts after cleaning!",
    f"Cleaned dataset saved at: {cleaned_text_file_path}",
    "Removed HTML tags, punctuation, special characters, and numbers from post content!",
    sep="\n",
)
print(tabulate(df_cleaned.head(5), headers='keys', tablefmt='fancy_grid'))

Dataset now contains 2693 posts after cleaning!
Cleaned dataset saved at: /content/drive/MyDrive/SW_PROJECT/openai_prompting_cleaned.csv
Removed HTML tags, punctuation, special characters, and numbers from post content!
╒════╤═══════════╤═════════════════════════════════════════════════════════════════╤═════════════════════════════════════╤══════════════════════════════════╤══════════════════════════════════╤═════════╤═══════════╤════════════════╤═════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════