<a href="https://colab.research.google.com/github/giannisalinetti/dwllm/blob/main/Dataset_wedding_reddit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Subreddit dataset generator for BigBudgetBrides

## Scrape data from Reddit

In [None]:
pip install praw

In [7]:
import praw
import pandas as pd

In [None]:
from google.colab import userdata
import time

# Reddit API client. The client id and secret are looked up through
# google.colab's `userdata` rather than being written into the notebook.
reddit = praw.Reddit(
    user_agent="dataset_scraper/1.0",
    client_id=userdata.get('reddit_client_id'),
    client_secret=userdata.get('reddit_client_secret'),
    check_for_async=False,
)

# Name of the subreddit whose hot posts are scraped below.
subreddit_name = "BigBudgetBrides"

def fetch_hot_posts(subreddit_name, retries=3):
    """
    Fetches hot posts from a given subreddit and their comments, with error handling.

    One row is produced per comment obtained by iterating ``post.comments``;
    the post-level fields are repeated on every row of that post.
    NOTE(review): in PRAW, iterating ``post.comments`` directly is expected to
    yield top-level comments only; use ``post.comments.list()`` if replies
    should be included - confirm the intent.

    A retry walks the hot listing again from the start, so posts that were
    already collected in an earlier attempt are skipped instead of being
    appended a second time.

    Args:
        subreddit_name (str): The name of the subreddit.
        retries (int): The maximum number of attempts at walking the listing.

    Returns:
        list: Rows of ``[title, score, url, num_comments, selftext,
        comment body, comment score, comment created_utc]``. The list may be
        partial if an error stopped the scrape early.
    """

    posts = []
    # Ids of posts whose comments are already stored in `posts`.
    processed_post_ids = set()

    for attempt in range(retries):
        try:
            for post in reddit.subreddit(subreddit_name).hot(limit=None):
                if post.id in processed_post_ids:
                    # Collected during an earlier attempt; do not duplicate it.
                    continue

                post.comments.replace_more(limit=None)

                # Build the rows for this post first so that a failure midway
                # never leaves a half-collected post in the result.
                post_rows = [
                    [
                        post.title,
                        post.score,
                        post.url,
                        post.num_comments,
                        post.selftext,
                        comment.body,
                        comment.score,
                        comment.created_utc,
                    ]
                    for comment in post.comments
                ]
                posts.extend(post_rows)
                processed_post_ids.add(post.id)

                # Pause between posts to stay gentle on the API.
                time.sleep(2)
            break
        # NOTE(review): HTTP 429 responses may surface as a prawcore exception
        # rather than praw.exceptions.APIException - confirm which type the
        # installed PRAW version raises.
        except praw.exceptions.APIException as e:
            if "429" not in str(e):
                print(f"Unexpected error: {e}")
                break  # Stop if another error occurs

            if attempt == retries - 1:
                # Nothing left to retry, so waiting would only waste time.
                print("Rate limit hit! No retries left.")
                break

            wait_time = 60  # Wait 1 minute before retrying
            print(f"Rate limit hit! Waiting {wait_time} seconds...")
            time.sleep(wait_time)
    return posts

# Run the scrape for the configured subreddit; every returned row holds one
# comment together with the fields of the post it belongs to.
retrieved_posts = fetch_hot_posts(subreddit_name)


## Create the dataframe and cleanup

In [None]:
# Build the dataframe (one row per scraped comment) and tidy it in one chain
df = (
    pd.DataFrame(
        retrieved_posts,
        columns=[
            "title",
            "score",
            "url",
            "num_comments",
            "selftext",
            "comment_body",
            "comment_score",
            "comment_timestamp",
        ],
    )
    # Rows sharing the same comment text are collapsed into one
    .drop_duplicates(subset=["comment_body"])
    # Discard rows with any missing value
    .dropna()
    # Renumber the remaining rows from zero
    .reset_index(drop=True)
)


In [None]:
# Normalize text data
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources needed by the stopword list, the tokenizer
# and the lemmatizer used in this cell.
for resource_name in ("stopwords", "punkt_tab", "wordnet"):
    nltk.download(resource_name)

# Shared helpers for clean_and_lemmatize_text
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def clean_and_lemmatize_text(text):
    """Return a normalized, stopword-free, lemmatized version of ``text``.

    The input is converted to ``str`` and lower-cased, URLs and every
    character other than letters, digits and whitespace are removed, and the
    remainder is tokenized. Tokens found in ``stop_words`` are dropped and the
    surviving tokens are lemmatized once.

    Args:
        text: Value to clean; anything accepted by ``str()``.

    Returns:
        str: The cleaned tokens joined by single spaces (possibly empty).
    """
    text = str(text).lower()
    # Remove URLs; the dot after "www" is escaped so that ordinary words that
    # merely start with "www" are left alone.
    text = re.sub(r"http\S+|www\.\S+", "", text)
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Remove special characters

    cleaned_tokens = []
    for token in word_tokenize(text):
        # Test the surface form first: the noun lemmatizer can distort
        # stopwords (e.g. "was" -> "wa"), which would let them slip through
        # a check made only after lemmatization.
        if token in stop_words:
            continue
        lemma = lemmatizer.lemmatize(token)
        # Also drop tokens whose lemma is itself a stopword.
        if lemma not in stop_words:
            cleaned_tokens.append(lemma)

    # Reconstruct sentence
    return " ".join(cleaned_tokens)


# Store a normalized copy of each comment next to the raw text
df["comment_body_clean"] = df["comment_body"].map(clean_and_lemmatize_text)

from datetime import datetime
# Treat the stored comment timestamps as epoch seconds and turn them into
# pandas datetimes
df["comment_timestamp"] = pd.to_datetime(df["comment_timestamp"], unit="s")


## Save local CSV

In [None]:
# Save the cleaned dataframe as a CSV file on the mounted Google Drive
import os
from google.colab import drive
drive.mount('/content/drive')

save_path = "/content/drive/My Drive/datasets/big_budget_brides.csv"

# DataFrame.to_csv does not create missing folders, so make sure the target
# directory exists before writing (no effect when it is already there).
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# index=False keeps the row index out of the file
df.to_csv(save_path, index=False)

## Convert to HuggingFace dataset format

In [None]:
# Install required packages
!pip install datasets transformers

In [None]:
from datasets import Dataset

# Locations on the mounted Drive: the CSV to load and the output folder
csv_path = "/content/drive/My Drive/datasets/big_budget_brides.csv"
hf_dataset_dir = "/content/drive/My Drive/datasets/big_budget_brides_hf"

# Reload the table from disk and wrap it in a Hugging Face Dataset
df = pd.read_csv(csv_path)
dataset = Dataset.from_pandas(df)

# Persist the dataset in the Hugging Face on-disk format
dataset.save_to_disk(hf_dataset_dir)
print("Dataset saved successfully!")


## Push to HuggingFace

In [None]:
!pip install huggingface_hub

In [None]:
from huggingface_hub import HfApi, HfFolder
from google.colab import userdata

# Read the Hugging Face access token through google.colab's `userdata`
HF_TOKEN = userdata.get('HF_TOKEN')

# Save the token to use it for authentication
# NOTE(review): HfFolder is reported as deprecated in recent huggingface_hub
# releases - confirm it is still available in the installed version.
HfFolder.save_token(HF_TOKEN)

# Initialize API with token
# NOTE(review): `api` is not referenced again in the cells visible here.
api = HfApi(token=HF_TOKEN)


In [None]:
# Upload the dataset to the Hub repository named below, authenticating with
# the token read earlier
repo_id = "gbsalinetti/bigbudgetbrides-reddit-dataset"
dataset.push_to_hub(repo_id=repo_id, token=HF_TOKEN)