### **Phase 1: Data Collection**

**Objective:**
retrive data from Reddit using PRAW library

Key Questions: 
how to use praw and what are the data formarts i get ? 
how to create batches to respect rate limits?  

In [38]:
import praw
import pandas as pd
from typing import List, Dict

def initialize_reddit(client_id: str, client_secret: str, user_agent: str) -> praw.Reddit:
    """Initialize and return Reddit API instance."""
    return praw.Reddit(
        client_id=client_id,
        client_secret=client_secret,
        user_agent=user_agent
    )

def get_posts(reddit: praw.Reddit, subreddits: List[str], method_limits: Dict[str, int], top_time_filter: str = 'month') -> pd.DataFrame:
    """
    Fetch posts from specified subreddits using different sorting methods.
    
    Args:
        reddit: Reddit API instance
        subreddits: List of subreddit names
        method_limits: Dictionary of method names and their post limits
        top_time_filter: Time filter for top posts ('hour', 'day', 'week', 'month', 'year', 'all')
    """
    all_posts = []
    
    for subreddit_name in subreddits:
        subreddit = reddit.subreddit(subreddit_name)
        
        for method, limit in method_limits.items():
            # Handle 'top' posts separately due to time_filter parameter
            if method == 'top':
                submissions = subreddit.top(limit=limit, time_filter=top_time_filter)
            else:
                submissions = getattr(subreddit, method)(limit=limit)
            
            # Extract post data
            for submission in submissions:
                all_posts.append({
                    'id': submission.id,
                    'title': submission.title,
                    'score': submission.score,
                    'num_comments': submission.num_comments,
                    'created_utc': submission.created_utc,
                    'subreddit': submission.subreddit.display_name,
                    'method': f"{method}_{top_time_filter}" if method == 'top' else method
                })
    
    return pd.DataFrame(all_posts)


# Configuration
SUBREDDITS = ['cryptocurrency']
CLIENT_ID = 'noE55of6CoertN468h29uw'
CLIENT_SECRET = 'dLzC05Xs2G9ZyjU8yrEVIoZsX3EYTA'
USER_AGENT = 'my_reddit_app:v1.0 (by_gigisducktales)'

# Define limits for each method
all_limit = 10

METHOD_LIMITS = {
    'hot': all_limit,
    'new': all_limit,
    'controversial': all_limit,
    'rising': all_limit,
    'top': all_limit
}

# Define time filter for top posts
TOP_TIME_FILTER = 'month'  # Options: 'hour', 'day', 'week', 'month', 'year', 'all'

# Initialize Reddit instance
reddit = initialize_reddit(CLIENT_ID, CLIENT_SECRET, USER_AGENT)

# Get posts and create DataFrame
df = get_posts(reddit, SUBREDDITS, method_limits=METHOD_LIMITS, top_time_filter=TOP_TIME_FILTER)

df 

Unnamed: 0,id,title,score,num_comments,created_utc,subreddit,method
0,1hww4fq,Moon Week 57,17,172,1736373000.0,CryptoCurrency,hot
1,1hz9one,"Daily Crypto Discussion - January 12, 2025 (GM...",22,272,1736640000.0,CryptoCurrency,hot
2,1hz3i4j,This explains everything,4340,133,1736623000.0,CryptoCurrency,hot
3,1hz6sge,Not Again...,1705,148,1736632000.0,CryptoCurrency,hot
4,1hz3s9e,16 years ago today,679,73,1736624000.0,CryptoCurrency,hot
5,1hzi426,UK judge says there’s no “reasonable grounds” ...,63,56,1736670000.0,CryptoCurrency,hot
6,1hzayu2,Coinbase's $300k software engineers aren't hap...,227,46,1736644000.0,CryptoCurrency,hot
7,1hz4lac,This historical pattern sets date when Bitcoin...,476,162,1736626000.0,CryptoCurrency,hot
8,1hzlmao,Washington pastor accused of stealing $5.9M fo...,22,19,1736685000.0,CryptoCurrency,hot
9,1hyzg4u,Biden Pushes Controversial Crypto Regulation i...,809,474,1736613000.0,CryptoCurrency,hot


In [44]:
df1 = df.copy()
df1.drop_duplicates(subset='id',inplace=True)
# df1['created'] = pd.to_datetime(df['created_utc'],unit='s')
# df1.index = df1['created']
# df1.drop(columns=['created_utc','created'],inplace=True)
df1.head()


Unnamed: 0,id,title,score,num_comments,created_utc,subreddit,method


### get comments from methods 

In [49]:
import tqdm

# Function to retrieve comments for a submission
def get_comments(submission_id):
    submission = reddit.submission(id=submission_id)
    submission.comments.replace_more(limit=None)
    comments = submission.comments.list()
    comments_data = []
    errocount = 0
    try:
        for comment in comments:
            comments_data.append({
                'submission_id': submission_id,
                'comment_id': comment.id,
                'comment_body': comment.body,
                'comment_score': comment.score,
                'comment_created_utc': comment.created_utc
            })
    except Exception as e:
        errocount += 1
        # print(f"Error: {e}")
        comments_data.append({
            'submission_id': submission_id,
            'comment_id': e,
            'comment_body': e,
            'comment_score': e,
            'comment_created_utc': e
        })

    return comments_data
    

# Retrieve comments for each submission and create a DataFrame
all_comments = []
for submission_id in tqdm(df1['id'], desc="Processing submissions"):
    if pd.notna(submission_id):  # Ensure submission_id is not NaN
        all_comments.extend(get_comments(submission_id))


comments_df = pd.DataFrame(all_comments)

# Merge comments DataFrame with the original DataFrame
merged_df2 = df1.merge(comments_df, left_on='id', right_on='submission_id', how='left')

merged_df2




Unnamed: 0,id,title,score,num_comments,created_utc,subreddit,method,submission_id,comment_id,comment_body,comment_score,comment_created_utc
0,1hww4fq,Moon Week 57,17,172,1.736373e+09,CryptoCurrency,hot,1hww4fq,m68pyxe,"Hi everyone, as some of you noticed the snapsh...",1,1.736438e+09
1,1hww4fq,Moon Week 57,17,172,1.736373e+09,CryptoCurrency,hot,1hww4fq,m651zvq,"Another 28,800 MOON was just burned up today!!...",16,1.736381e+09
2,1hww4fq,Moon Week 57,17,172,1.736373e+09,CryptoCurrency,hot,1hww4fq,m65deco,Screw crypto.com for delisting us! \n\nWe will...,9,1.736384e+09
3,1hww4fq,Moon Week 57,17,172,1.736373e+09,CryptoCurrency,hot,1hww4fq,m64hfai,MOON week today. \n\n \nA bit of a hurdle for...,7,1.736374e+09
4,1hww4fq,Moon Week 57,17,172,1.736373e+09,CryptoCurrency,hot,1hww4fq,m6lj805,what is moon? Where do you buy moon? So many m...,7,1.736614e+09
...,...,...,...,...,...,...,...,...,...,...,...,...
10489,1hy8yg6,Rich only in pain...,4913,190,1.736528e+09,CryptoCurrency,top_month,1hy8yg6,m6g6u49,I didn't misread it. I was just commenting on ...,-13,1.736536e+09
10490,1hy8yg6,Rich only in pain...,4913,190,1.736528e+09,CryptoCurrency,top_month,1hy8yg6,m6k5fj8,lol ya hurt to think about it sometimes of all...,2,1.736593e+09
10491,1hy8yg6,Rich only in pain...,4913,190,1.736528e+09,CryptoCurrency,top_month,1hy8yg6,m6jz7ef,Yes we need their blood to live forever,3,1.736589e+09
10492,1hy8yg6,Rich only in pain...,4913,190,1.736528e+09,CryptoCurrency,top_month,1hy8yg6,m6g8ydx,I didn't start with crypto till 3 years ago wh...,6,1.736536e+09


## Commonly Used Methods

1. get all the subreddits with the methods .hot() .new() .controverisal() .rising() .top() 

2. combine in dataframe 

3. filter post using keywords and phrases for particular coin 
    - get rid of double entries 

4. get all comments from relevant posts and add to dataframe 

5. filter post using keywords and phrases for particular coin again

6. sort comments by time 

7. use cryptobert to give submissions numerical value 

challange of valuation: what value is is given to recent data ? maybe it should go a little bit like an ema where there is more weight on the most recent sentiment or maybe now value at all to the past ? what is a good time frame daily metric ? 





`subreddit.hot(limit=10)`
- **Description:** Fetches the hot posts from the subreddit. Hot posts are those that are currently popular and receiving a lot of attention.
1. cluster post using keywordsand phrases 



`subreddit.new(limit=10)`
- **Description:** Fetches the newest posts from the subreddit, sorted from newest to oldest.
1. cluster post using keywordsand phrases 


`subreddit.top(limit=10, time_filter='month')`
- **Description:** Fetches the top posts from the subreddit based on score. You can specify a time filter (e.g., 'day', 'week', 'month', 'year', 'all').
1. cluster post using keywordsand phrases 


`subreddit.controversial(limit=10)`
- **Description:** Fetches the most controversial posts from the subreddit. Controversial posts are those with a high number of upvotes and downvotes.
1. cluster post using keywordsand phrases 


`subreddit.rising(limit=10)`
- **Description:** Fetches posts that are gaining popularity quickly.
1. cluster post using keywordsand phrases 



In [53]:

m_df = merged_df.copy()

m_df['created_utc'] = pd.to_datetime(m_df['created_utc'],unit='s')
m_df['comment_created_utc'] = pd.to_datetime(m_df['comment_created_utc'],unit='s')
m_df['created'] = m_df['created_utc'].combine_first(m_df['comment_created_utc'])
m_df

ValueError: unit='s' not valid with non-numerical val=''MoreComments' object has no attribute 'body'', at position 38

In [52]:
# 
merged_df2.info()
merged_df2.to_pickle('df_all_comments.pkl')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10494 entries, 0 to 10493
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   10494 non-null  object 
 1   title                10494 non-null  object 
 2   score                10494 non-null  int64  
 3   num_comments         10494 non-null  int64  
 4   created_utc          10494 non-null  float64
 5   subreddit            10494 non-null  object 
 6   method               10494 non-null  object 
 7   submission_id        10494 non-null  object 
 8   comment_id           10494 non-null  object 
 9   comment_body         10494 non-null  object 
 10  comment_score        10494 non-null  int64  
 11  comment_created_utc  10494 non-null  float64
dtypes: float64(2), int64(3), object(7)
memory usage: 983.9+ KB


In [7]:
submission = reddit.submission(id=df['id'][0]) 

comments = []
submission.comments.replace_more(limit=None)  # Load all comments
for comment in submission.comments.list():
    print(f"Author: {comment.author}, Comment: {comment.body}, Score: {comment.score}")
    comments.append({
        'author': str(comment.author),
        'comment': comment.body,
        'score': comment.score
    })

Author: MaeronTargaryen, Comment: Hi everyone, as some of you noticed the snapshot file was wrong. Thanks for notifying us, it has now been updated, Score: 1
Author: coinsRus-2021, Comment: Another 28,800 MOON was just burned up today!! 🔥

https://x.com/coinsrus2021/status/1877141460849430871?s=46&t=LyBkcatAp59BnttrnsMLOA, Score: 15
Author: Maleficent_Sound_919, Comment: Screw crypto.com for delisting us! 

We will go on!, Score: 8
Author: the_far_yard, Comment: MOON week today. 

  
A bit of a hurdle for everyone, but we've got good mods and community to bring back Moons onto some exchanges. It's a project that's one marketing ploy away from being noticed. 

Happy to be a part of this group, and congrats on to everyone who's able to get some new Moons!, Score: 8
Author: sadiq_238, Comment: Wrong file, Score: 5
Author: Abysskitten, Comment: Boo! This is the old snapshot.

![gif](giphy|F3BeiZNq6VbDwyxzxF), Score: 5
Author: DBRiMatt, Comment: Surprised this poll wasn't proposed sooner to

In [9]:
df_comments = pd.DataFrame(comments)
df_comments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 164 entries, 0 to 163
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   author   164 non-null    object
 1   comment  164 non-null    object
 2   score    164 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 4.0+ KB


In [22]:
# Search for subreddits related to a keyword
keyword = 'cryptocurrency'
subreddits = reddit.subreddits.search(keyword, limit=100)

# x = []  
# # Print the names of the subreddits found
# for subreddit in subreddits:
#     print(subreddit.display_name)
#     x.append(subreddit.display_name)

x = []  
# Print the names and additional information of the subreddits found
for subreddit in subreddits:
    print(f"Subreddit: {subreddit.display_name}")
    print(f"Subscribers: {subreddit.subscribers}")
    print(f"Active Users: {subreddit.accounts_active}")
    print(f"Description: {subreddit.public_description}")
    print("-" * 40)
    x.append(subreddit.display_name)

Subreddit: CryptoCurrency
Subscribers: 9386804
Active Users: None
Description: The leading community for cryptocurrency news, discussion, and analysis.
----------------------------------------
Subreddit: CryptocurrencyICO
Subscribers: 190448
Active Users: None
Description: r/Cryptocurrency & ICO is a hub for sharing crypto news & discussing new innovative ICO quality projects with proven utility.
----------------------------------------
Subreddit: CryptocurrencyReviews
Subscribers: 32900
Active Users: None
Description: This is a place for cryptocurrency discussion, which is the next hottest crypto? What exchange is great for trading? What are your forecasts of price? Share it here.
----------------------------------------
Subreddit: Bitcoin
Subscribers: 7555510
Active Users: None
Description: Bitcoin is the currency of the Internet: a distributed, worldwide, decentralized digital money. Unlike traditional currencies such as dollars, bitcoins are issued and managed without any central a

In [21]:
x = []  
# Print the names and additional information of the subreddits found
for subreddit in subreddits:
    print(f"Subreddit: {subreddit.display_name}")
    print(f"Subscribers: {subreddit.subscribers}")
    print(f"Active Users: {subreddit.accounts_active}")
    print(f"Description: {subreddit.public_description}")
    print("-" * 40)
    x.append(subreddit.display_name)

## example code for berttopic an spcay pre processing 


In [None]:
import praw
import pandas as pd
import spacy
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the Reddit instance
reddit = praw.Reddit(
    client_id='your_client_id',
    client_secret='your_client_secret',
    user_agent='my_reddit_app:v1.0 (by /u/your_reddit_username)'
)

# List of cryptocurrency-related subreddits
crypto_subreddits = ['cryptocurrency', 'Bitcoin', 'CryptoMarkets', 'CryptoCurrencyTrading']

# Fetch submissions from these subreddits
posts_data = []
for subreddit_name in crypto_subreddits:
    subreddit = reddit.subreddit(subreddit_name)
    for submission in subreddit.new(limit=100):  # Fetch the newest 100 submissions
        posts_data.append({
            'title': submission.title,
            'selftext': submission.selftext,
            'score': submission.score,
            'id': submission.id,
            'url': submission.url,
            'num_comments': submission.num_comments,
            'created': submission.created_utc,
            'author': str(submission.author),
            'subreddit': subreddit_name
        })

# Create a DataFrame from the posts data
df_posts = pd.DataFrame(posts_data)
df_posts['created'] = pd.to_datetime(df_posts['created'], unit='s')

# Combine title and selftext for clustering
df_posts['text'] = df_posts['title'] + ' ' + df_posts['selftext']

# Preprocess text data using SpaCy
nlp = spacy.load('en_core_web_sm')

def preprocess_text(text):
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    return ' '.join(tokens)

df_posts['processed_text'] = df_posts['text'].apply(preprocess_text)

# Initialize BERTopic model
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")
topic_model = BERTopic(vectorizer_model=vectorizer_model)

# Fit the model on the processed text data
topics, probabilities = topic_model.fit_transform(df_posts['processed_text'].tolist())

# Add the topics to the DataFrame
df_posts['topic'] = topics

# Save the clustered data to a CSV file
df_posts.to_csv('reddit_posts_bertopic.csv', index=False)

# Visualize the topics
topic_model.visualize_topics()