## Mount Google Drive

In [None]:
# Colab helper used to attach Google Drive to this runtime's filesystem.
from google.colab import drive
# Mount at /content/drive; the CSV paths used later in this notebook live under
# /content/drive/MyDrive, so this cell has to run before any of them are read.
drive.mount('/content/drive')

Mounted at /content/drive


## Import necessary libraries

In [None]:
!pip install tabulate
from tabulate import tabulate
import requests
import pandas as pd
import time
import json
import re
from collections import Counter
import nltk
from nltk.corpus import stopwords
from tabulate import tabulate
from nltk.stem import WordNetLemmatizer
# Download the NLTK "wordnet" resource; presumably needed by the
# WordNetLemmatizer imported above — confirm against the NLTK docs.
nltk.download("wordnet")
# Download the "omw-1.4" resource as well.
# NOTE(review): looks like a companion to wordnet that some NLTK versions require — confirm.
nltk.download("omw-1.4")



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

## Removal of common stopwords

In [None]:
# Source CSV (cleaned posts) and destination CSV (stopwords removed)
input_file_path = "/content/drive/MyDrive/SW_PROJECT/openai_prompting_cleaned.csv"
output_file_path = "/content/drive/MyDrive/SW_PROJECT/openai_prompting_cleaned_no_stopwords.csv"

# Read the cleaned posts into a DataFrame
df = pd.read_csv(input_file_path)

# Fetch the NLTK stopword corpus and build a set for fast membership tests
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))


def _drop_common_stopwords(text):
    """Return `text` without the whitespace-separated tokens found in `stop_words`."""
    kept_tokens = [token for token in text.split() if token not in stop_words]
    return " ".join(kept_tokens)


# Derive the stopword-free text column from 'Cleaned Post'
df["Cleaned Post Without Stopwords"] = (
    df["Cleaned Post"]
    .astype(str)
    .apply(_drop_common_stopwords)
)

# Metadata columns plus the newly derived text column
columns_to_keep = [
    "Post ID",
    "Title",
    "Tags",
    "Created At",
    "Last Activity",
    "Views",
    "Replies",
    "Participants",
    "Cleaned Post Without Stopwords",
]

# Write only the selected columns, without the index
df[columns_to_keep].to_csv(output_file_path, index=False)

# Read the file back so the preview reflects exactly what was written
df_preview = pd.read_csv(output_file_path)

print(f"Cleaned dataset without stopwords saved to: {output_file_path}")
print(
    tabulate(
        df_preview.head(5),
        headers='keys',
        tablefmt='fancy_grid',
    )
)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Cleaned dataset without stopwords saved to: /content/drive/MyDrive/SW_PROJECT/openai_prompting_cleaned_no_stopwords.csv
╒════╤═══════════╤═════════════════════════════════════════════════════════════════╤═════════════════════════════════════╤══════════════════════════════════╤══════════════════════════════════╤═════════╤═══════════╤════════════════╤═════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════

## List of top 50 common words

In [None]:
# Load the cleaned dataset
file_path = "/content/drive/MyDrive/SW_PROJECT/openai_prompting_cleaned_no_stopwords.csv"
df = pd.read_csv(file_path)

# A post whose text ended up empty is stored as an empty CSV field, which
# read_csv returns as NaN (a float). str.join raises TypeError on non-string
# items, so replace missing values with "" and force everything to str first.
posts = df["Cleaned Post Without Stopwords"].fillna("").astype(str)

# Tokenize on whitespace and count word frequency across all posts
word_list = " ".join(posts).split()
word_counts = Counter(word_list)

# Show the top 50 most common words
common_words = word_counts.most_common(50)
common_words_df = pd.DataFrame(common_words, columns=["Word", "Frequency"])

print("Top 50 Most Frequent Words:\n")
print(tabulate(common_words_df, headers='keys', tablefmt='fancy_grid'))

Top 50 Most Frequent Words:

╒════╤═════════════╤═════════════╕
│    │ Word        │   Frequency │
╞════╪═════════════╪═════════════╡
│  0 │ prompt      │        3529 │
├────┼─────────────┼─────────────┤
│  1 │ gpt         │        2010 │
├────┼─────────────┼─────────────┤
│  2 │ like        │        1716 │
├────┼─────────────┼─────────────┤
│  3 │ use         │        1657 │
├────┼─────────────┼─────────────┤
│  4 │ ai          │        1647 │
├────┼─────────────┼─────────────┤
│  5 │ im          │        1621 │
├────┼─────────────┼─────────────┤
│  6 │ using       │        1491 │
├────┼─────────────┼─────────────┤
│  7 │ user        │        1420 │
├────┼─────────────┼─────────────┤
│  8 │ model       │        1364 │
├────┼─────────────┼─────────────┤
│  9 │ text        │        1354 │
├────┼─────────────┼─────────────┤
│ 10 │ would       │        1340 │
├────┼─────────────┼─────────────┤
│ 11 │ get         │        1175 │
├────┼─────────────┼─────────────┤
│ 12 │ chatgpt     │      

## Removal of custom stopwords

In [None]:
# Define file paths
input_file_path = "/content/drive/MyDrive/SW_PROJECT/openai_prompting_cleaned_no_stopwords.csv"
output_file_path = "/content/drive/MyDrive/SW_PROJECT/openai_prompting_cleaned_custom_stopwords.csv"

# Load the dataset
df = pd.read_csv(input_file_path)

# Define custom domain-specific stopwords
custom_stopwords = {
    "openai", "chatgpt", "gpt", "ai", "api", "im", "ive", "dalle"
}

# Remove custom domain-specific stopwords.
# Empty CSV fields are read back as NaN; calling astype(str) on them directly
# would inject the literal token "nan" into the text (and into the word counts
# below), so missing values are replaced with "" first.
df["Cleaned Post Without Custom Stopwords"] = (
    df["Cleaned Post Without Stopwords"]
    .fillna("")
    .astype(str)
    .apply(lambda x: " ".join([word for word in x.split() if word not in custom_stopwords]))
)

# Tokenize and count word frequency.
# The resulting table is computed but not displayed in this cell.
word_list = " ".join(df["Cleaned Post Without Custom Stopwords"]).split()
word_counts = Counter(word_list)
common_words_df = pd.DataFrame(word_counts.most_common(50), columns=["Word", "Frequency"])

# Select only relevant columns (including metadata + new cleaned post)
columns_to_keep = [
    "Post ID", "Title", "Tags", "Created At", "Last Activity",
    "Views", "Replies", "Participants", "Cleaned Post Without Custom Stopwords"
]

# Save the trimmed and updated DataFrame
df[columns_to_keep].to_csv(output_file_path, index=False)

# Preview the result (re-read from disk so it shows what was actually written)
df_preview = pd.read_csv(output_file_path)
print(f"Final cleaned dataset saved to: {output_file_path}")
print(tabulate(df_preview.head(5), headers='keys', tablefmt='fancy_grid'))

Final cleaned dataset saved to: /content/drive/MyDrive/SW_PROJECT/openai_prompting_cleaned_custom_stopwords.csv
╒════╤═══════════╤═════════════════════════════════════════════════════════════════╤═════════════════════════════════════╤══════════════════════════════════╤══════════════════════════════════╤═════════╤═══════════╤════════════════╤═════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════

## Applying lemmatization

In [None]:
import pandas as pd
from nltk.stem import WordNetLemmatizer
from tabulate import tabulate

# Define file paths
input_file_path = "/content/drive/MyDrive/SW_PROJECT/openai_prompting_cleaned_custom_stopwords.csv"
output_file_path_lemmatized = "/content/drive/MyDrive/SW_PROJECT/openai_prompting_lemmatized.csv"

# Load the dataset
df = pd.read_csv(input_file_path)

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()


def lemmatize_text(text):
    """Lemmatize each whitespace-separated token of `text`.

    Args:
        text: A string of space-separated words.

    Returns:
        The lemmatized tokens joined by single spaces; "" for empty input.

    Note:
        `lemmatizer.lemmatize` is called without a `pos` argument, so NLTK's
        default part of speech applies.
        NOTE(review): the default appears to be noun, in which case inflected
        verbs may not be reduced — confirm against the NLTK docs.
    """
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])


# Apply lemmatization to the cleaned text.
# Empty CSV fields are read back as NaN; calling astype(str) on them directly
# would turn them into the literal word "nan", so missing values are replaced
# with "" first.
df["Lemmatized Post"] = (
    df["Cleaned Post Without Custom Stopwords"]
    .fillna("")
    .astype(str)
    .apply(lemmatize_text)
)

# Select relevant columns including metadata and lemmatized post
columns_to_keep = [
    "Post ID", "Title", "Tags", "Created At", "Last Activity",
    "Views", "Replies", "Participants", "Lemmatized Post"
]

# Save the selected metadata columns + lemmatized text
df[columns_to_keep].to_csv(output_file_path_lemmatized, index=False)

# Preview (re-read from disk so it shows what was actually written)
df_preview = pd.read_csv(output_file_path_lemmatized)
print(f"Final lemmatized dataset saved to: {output_file_path_lemmatized}")
print(tabulate(df_preview.head(5), headers='keys', tablefmt='fancy_grid'))

Final lemmatized dataset saved to: /content/drive/MyDrive/SW_PROJECT/openai_prompting_lemmatized.csv
╒════╤═══════════╤═════════════════════════════════════════════════════════════════╤═════════════════════════════════════╤══════════════════════════════════╤══════════════════════════════════╤═════════╤═══════════╤════════════════╤════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════