<a href="https://colab.research.google.com/github/giannisalinetti/dwllm/blob/main/Dataset_wedding_reddit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Subreddit Dataset generator for BigBudgetBrides

## Scrape data from Reddit

In [1]:
pip install praw

Collecting praw
  Downloading praw-7.8.1-py3-none-any.whl.metadata (9.4 kB)
Collecting prawcore<3,>=2.4 (from praw)
  Downloading prawcore-2.4.0-py3-none-any.whl.metadata (5.0 kB)
Collecting update_checker>=0.18 (from praw)
  Downloading update_checker-0.18.0-py3-none-any.whl.metadata (2.3 kB)
Downloading praw-7.8.1-py3-none-any.whl (189 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m189.3/189.3 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading prawcore-2.4.0-py3-none-any.whl (17 kB)
Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Installing collected packages: update_checker, prawcore, praw
Successfully installed praw-7.8.1 prawcore-2.4.0 update_checker-0.18.0


In [2]:
import praw
import pandas as pd

In [None]:
from google.colab import userdata
import time

# Reddit API client; the app credentials are looked up through Colab's
# `userdata` helper instead of being written into the notebook.
reddit = praw.Reddit(
    client_id=userdata.get("reddit_client_id"),
    client_secret=userdata.get("reddit_client_secret"),
    user_agent="dataset_scraper/1.0",
    check_for_async=False,
)

# Subreddit whose hot posts are scraped below
subreddit_name = "BigBudgetBrides"

def fetch_hot_posts(subreddit_name, retries=3):
    """
    Fetches hot posts from a given subreddit and their comments, with error handling.

    One row is collected for every comment yielded by iterating
    ``post.comments`` (after ``replace_more`` has expanded the comment tree).

    Args:
        subreddit_name (str): The name of the subreddit.
        retries (int): The number of retries for API requests.

    Returns:
        list[list]: One row per comment, in the order
            [title, score, url, num_comments, selftext,
             comment body, comment score, comment created_utc].
            If an attempt is interrupted by an error, the rows gathered
            during the last attempt are returned.
    """

    posts = []
    for attempt in range(retries):
        # Start every attempt from an empty list so that a retry after a
        # partially completed run does not append the same comments twice.
        posts = []
        try:
            for post in reddit.subreddit(subreddit_name).hot(limit=None):
                post.comments.replace_more(limit=None)

                for comment in post.comments:
                    posts.append([post.title,
                                  post.score,
                                  post.url,
                                  post.num_comments,
                                  post.selftext,
                                  comment.body,
                                  comment.score,
                                  comment.created_utc
                    ])
                # Pause between posts to stay gentle on the API
                time.sleep(2)
            break
        except praw.exceptions.APIException as e:
            if "429" in str(e):
                wait_time = 60  # Wait 1 minute before retrying
                print(f"Rate limit hit! Waiting {wait_time} seconds...")
                time.sleep(wait_time)
            else:
                print(f"Unexpected error: {e}")
                break  # Stop if another error occurs

    # Hand the rows back to the caller; previously they were built and then
    # thrown away when the function returned.
    return posts


# Keep the scraped rows: the dataframe cell further down reads them from `posts`.
posts = fetch_hot_posts(subreddit_name)




## Create the dataframe and cleanup

In [28]:
# Column names, in the same order as the values in each scraped row
comment_columns = [
    "title",
    "score",
    "url",
    "num_comments",
    "selftext",
    "comment_body",
    "comment_score",
    "comment_timestamp",
]

# Build the table, then drop repeated comment bodies, drop rows with any
# missing value, and renumber the index from zero.
df = (
    pd.DataFrame(posts, columns=comment_columns)
    .drop_duplicates(subset=["comment_body"])
    .dropna()
    .reset_index(drop=True)
)


In [35]:
# Normalize text data
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used by the cleaning function below
for nltk_resource in ("stopwords", "punkt_tab", "wordnet"):
    nltk.download(nltk_resource)

# English stop words as a set for fast membership tests, plus a shared lemmatizer
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def clean_and_lemmatize_text(text):
  """
  Normalize a piece of text for the dataset.

  Steps: lowercase, strip URLs, strip every character that is not a letter,
  digit or whitespace, tokenize, drop English stop words, lemmatize the
  remaining tokens, and join them back with single spaces.

  Args:
      text: The value to clean; it is converted with ``str()`` first, so
          non-string values are accepted.

  Returns:
      str: The cleaned, space-separated tokens (may be an empty string).
  """
  text = str(text).lower()
  # The dot after "www" is escaped so only a literal "www." starts a URL
  text = re.sub(r"http\S+|www\.\S+", "", text) # Remove URLs
  text = re.sub(r"[^a-zA-Z0-9\s]", "", text) # Remove special characters

  tokens = word_tokenize(text)

  # Filter stop words on the raw tokens BEFORE lemmatizing: the lemmatizer can
  # rewrite a stop word into a form that is no longer in the stop-word set
  # (e.g. "was" -> "wa"), which would let it slip through the filter.
  # Each surviving token is lemmatized exactly once.
  cleaned_tokens = [
    lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
  ]

  # Reconstruct sentence
  return " ".join(cleaned_tokens)


# Store a normalized version of every comment next to the raw text
raw_comment_bodies = df["comment_body"]
df["comment_body_clean"] = raw_comment_bodies.apply(clean_and_lemmatize_text)

# Turn the epoch-second comment timestamps into pandas datetimes
from datetime import datetime
epoch_seconds = df["comment_timestamp"]
df["comment_timestamp"] = pd.to_datetime(epoch_seconds, unit="s")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Save CSV to Google Drive

In [36]:
# Save to CSV
import os

from google.colab import drive
drive.mount('/content/drive')

save_path = "/content/drive/My Drive/datasets/big_budget_brides.csv"

# DataFrame.to_csv does not create missing folders, so make sure the target
# directory exists before writing (e.g. on a Drive without a "datasets" folder).
os.makedirs(os.path.dirname(save_path), exist_ok=True)

df.to_csv(save_path, index=False)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Convert to HuggingFace dataset format

In [37]:
# Install required packages
!pip install datasets transformers

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [39]:
from datasets import Dataset

# Locations on the mounted Drive: the CSV to read and the folder to write
csv_path = "/content/drive/My Drive/datasets/big_budget_brides.csv"
hf_dataset_dir = "/content/drive/My Drive/datasets/big_budget_brides_hf"

# Reload the saved CSV and wrap it as a Hugging Face Dataset
df = pd.read_csv(csv_path)
dataset = Dataset.from_pandas(df)

# Write the dataset to disk in the Hugging Face format
dataset.save_to_disk(hf_dataset_dir)
print("Dataset saved successfully!")


Saving the dataset (0/1 shards):   0%|          | 0/11309 [00:00<?, ? examples/s]

Dataset saved successfully!


## Push to HuggingFace

In [40]:
!pip install huggingface_hub



In [41]:
from huggingface_hub import HfApi, HfFolder
from google.colab import userdata

# Look up the Hugging Face token through Colab's `userdata` helper
HF_TOKEN = userdata.get("HF_TOKEN")

# Store the token for later authentication, then create an API client with it
HfFolder.save_token(HF_TOKEN)
api = HfApi(token=HF_TOKEN)


In [42]:
# Upload the dataset to its repository on the Hugging Face Hub
repo_id = "gbsalinetti/bigbudgetbrides-reddit-dataset"
dataset.push_to_hub(repo_id=repo_id, token=HF_TOKEN)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/gbsalinetti/bigbudgetbrides-reddit-dataset/commit/48e60eaf527bb350f7cd734eb781fe61c21989f7', commit_message='Upload dataset', commit_description='', oid='48e60eaf527bb350f7cd734eb781fe61c21989f7', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/gbsalinetti/bigbudgetbrides-reddit-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='gbsalinetti/bigbudgetbrides-reddit-dataset'), pr_revision=None, pr_num=None)