In [1]:
import datetime
import os
import time
from typing import List, Dict, Any

import pandas as pd
import praw


In [2]:
class RedditDataExtractor:
    """Flatten subreddit posts and their comment trees into a list of row dicts.

    Every post and every comment becomes one dictionary with the same set of
    keys, so the combined list can be passed directly to ``pd.DataFrame``.
    The tree shape is preserved in three fields:

    * ``parent_id``  - id of the post or comment the row replies to
      (``None`` for a post),
    * ``level``      - 0 for a post, 1 for a top-level comment, 2 for a reply
      to it, and so on,
    * ``level_path`` - dotted sibling indices, e.g. ``0.2.1`` is the second
      reply to the third top-level comment.
    """

    # Maximum number of characters _clean_text keeps per text field.
    _MAX_TEXT_LENGTH = 10000

    def __init__(self, client_id, client_secret, user_agent):
        """Create the PRAW client used by every extraction method.

        Args:
            client_id: Reddit API application id.
            client_secret: Reddit API application secret.
            user_agent: User-agent string sent with the requests.
        """
        self.reddit = praw.Reddit(
            client_id=client_id,
            client_secret=client_secret,
            user_agent=user_agent
        )

    def get_posts_with_comments(self, subreddit_name: str, limit: int = 100,
                              posts_delay: float = 2.0, comments_delay: float = 1.0):
        """Collect rows for the hot posts of a subreddit and all their comments.

        Args:
            subreddit_name: Name passed to ``self.reddit.subreddit``.
            limit: Forwarded unchanged to ``subreddit.hot``. ``None`` is
                tolerated here (the progress counter then shows ``?``).
            posts_delay: Seconds to pause between two consecutive posts.
            comments_delay: Seconds to pause after each extracted comment.

        Returns:
            A list of row dictionaries: each post row is followed by the rows
            of its comments in depth-first order. If the listing fails part
            way through, the rows gathered up to that point are returned
            instead of being discarded.
        """
        print(f"Starting extraction from r/{subreddit_name}...")

        all_data = []
        subreddit = self.reddit.subreddit(subreddit_name)
        total_label = limit if limit is not None else '?'

        try:
            for i, post in enumerate(subreddit.hot(limit=limit)):
                # Pause *between* posts only. Sleeping before every post but
                # the first avoids a useless trailing pause when the listing
                # yields fewer posts than `limit`, and does not depend on
                # `limit` being a number.
                if i > 0 and posts_delay and posts_delay > 0:
                    time.sleep(posts_delay)

                try:
                    print(f"Processing post {i+1}/{total_label}: {post.title[:50]}...")

                    # The post row goes in first so that it survives even if
                    # its comments cannot be read.
                    all_data.append(self._extract_post_data(post))
                    all_data.extend(self._extract_comments_hierarchy(post, comments_delay))

                except Exception as e:
                    # Best effort: one broken post must not stop the run.
                    print(f"Error processing post {i+1}: {str(e)}")
                    continue

        except Exception as e:
            # The listing itself failed. Keep everything scraped so far;
            # throwing it away would waste the whole (slow) run.
            print(f"Error accessing subreddit: {str(e)}")
            if all_data:
                print(f"Returning {len(all_data)} rows collected before the error.")

        return all_data

    @staticmethod
    def _utc_datetime(timestamp: float) -> datetime.datetime:
        """Convert a POSIX timestamp to a naive datetime expressed in UTC.

        ``datetime.fromtimestamp`` without ``tz`` would use the local
        timezone of the machine, which contradicts the ``created_utc`` column
        name. The tzinfo is dropped again because Excel cannot store
        timezone-aware datetimes.
        """
        aware = datetime.datetime.fromtimestamp(timestamp, tz=datetime.timezone.utc)
        return aware.replace(tzinfo=None)

    def _extract_post_data(self, post) -> Dict[str, Any]:
        """Build the row dictionary for a submission (level 0, no parent)."""
        return {
            'post_id': post.id,
            'comment_id': post.id,
            'parent_id': None,
            'level': 0,
            'level_path': '0',
            'type': 'post',
            'title': post.title,
            'body': self._clean_text(post.selftext),
            'author': str(post.author) if post.author else '[deleted]',
            'score': post.score,
            'upvote_ratio': post.upvote_ratio,
            'num_comments': post.num_comments,
            'created_utc': self._utc_datetime(post.created_utc),
            'url': post.url,
            'permalink': post.permalink,
            'is_original_content': post.is_original_content,
            'is_self': post.is_self,
            'over_18': post.over_18,
            'spoiler': post.spoiler,
            'stickied': post.stickied
        }

    def _extract_comments_hierarchy(self, post, comments_delay: float) -> List[Dict[str, Any]]:
        """Return the rows of every comment under ``post`` in depth-first order.

        Errors are printed and whatever was collected before the error is
        returned, so a failing comment tree never aborts the caller.
        """
        comments_data = []

        try:
            # limit=None asks PRAW to resolve every "load more comments" stub.
            post.comments.replace_more(limit=None)

            for i, comment in enumerate(post.comments):
                self._process_comment_hierarchy(comment, comments_data, level=1,
                                              parent_id=post.id,
                                              level_path=f"0.{i}",
                                              delay=comments_delay)

        except Exception as e:
            print(f"Error extracting comments for post {post.id}: {str(e)}")

        return comments_data

    def _process_comment_hierarchy(self, comment, comments_data: List, level: int,
                                parent_id: str, level_path: str, delay: float):
        """Append the row for ``comment`` and, recursively, for all its replies.

        Args:
            comment: Comment object to convert.
            comments_data: Output list, extended in place.
            level: Depth of this comment (1 for a top-level comment).
            parent_id: Id of the post or comment this one replies to.
            level_path: Dotted sibling-index path of this comment.
            delay: Seconds to pause after the row has been appended.
        """
        try:
            submission = getattr(comment, 'submission', None)
            replies = getattr(comment, 'replies', None)

            comment_data = {
                'post_id': getattr(submission, 'id', None) or 'unknown',
                'comment_id': comment.id,
                'parent_id': parent_id,
                'level': level,
                'level_path': level_path,
                'type': 'comment',
                'title': '',
                'body': self._clean_text(comment.body),
                'author': str(comment.author) if comment.author else '[deleted]',
                'score': comment.score,
                'upvote_ratio': None,
                # For a comment this is the number of direct replies.
                'num_comments': len(replies) if replies is not None else 0,
                'created_utc': self._utc_datetime(comment.created_utc),
                'url': f"https://reddit.com{comment.permalink}",
                'permalink': comment.permalink,
                'is_original_content': None,
                'is_self': None,
                'over_18': getattr(comment, 'over_18', False),
                'spoiler': getattr(comment, 'spoiler', False),
                'stickied': comment.stickied
            }

            comments_data.append(comment_data)

            # NOTE(review): the tree looks fully loaded by replace_more()
            # before this method runs, so this pause may not throttle any
            # request - confirm before relying on it for rate limiting.
            # Non-positive delays are skipped: time.sleep() raises on negative
            # values, which would otherwise drop this comment's replies.
            if delay and delay > 0:
                time.sleep(delay)

            # Process replies with hierarchical numbering
            if replies is not None:
                for j, reply in enumerate(replies):
                    self._process_comment_hierarchy(reply, comments_data, level + 1,
                                                  parent_id=comment.id,
                                                  level_path=f"{level_path}.{j}",
                                                  delay=delay)

        except Exception as e:
            # getattr: the object that triggered the error may lack an id,
            # and the handler itself must not raise.
            print(f"Error processing comment {getattr(comment, 'id', 'unknown')}: {str(e)}")

    def _clean_text(self, text: str) -> str:
        """Normalize text so it fits in a single CSV/Excel cell.

        Runs of whitespace (including newlines and tabs) collapse to one
        space, remaining control characters below U+0020 are removed, and the
        result is cut to ``_MAX_TEXT_LENGTH`` characters. Falsy input yields
        an empty string.
        """
        if not text:
            return ""
        collapsed = ' '.join(text.split())
        # str.split() already consumed the whitespace-type control characters;
        # what is left below U+0020 is rejected by openpyxl when writing.
        printable = ''.join(ch for ch in collapsed if ch >= ' ')
        return printable[:self._MAX_TEXT_LENGTH]

    @staticmethod
    def _display_label(row, post_marker: str = "", comment_marker: str = "") -> str:
        """Format one row as a single indented line of the hierarchy column.

        Args:
            row: Mapping with the keys produced by the extraction methods.
            post_marker: Text placed in front of ``POST:`` for post rows.
            comment_marker: Text placed in front of the path of comment rows.
        """
        if row['type'] == 'post':
            return f"{post_marker}POST: {row['title']}"
        indent = "  " * row['level']
        author = row['author'] if row['author'] != '[deleted]' else 'deleted'
        return f"{indent}{comment_marker}[{row['level_path']}] {author}: {row['body'][:100]}..."

    def save_to_excel_with_hierarchy(self, data: List[Dict[str, Any]], filename: str):
        """Save with hierarchical structure clearly visible.

        Writes one sheet named ``Reddit Data`` whose first column shows the
        indented tree. If openpyxl is not installed the data is written as
        CSV instead, to the same name with ``.xlsx`` swapped for ``.csv``.
        """
        if not data:
            print("No data to save!")
            return

        df = pd.DataFrame(data)

        # Create a display column that shows the hierarchy
        df['display_hierarchy'] = df.apply(
            lambda row: self._display_label(row, post_marker="📝 ", comment_marker="💬 "),
            axis=1,
        )

        # Reorder columns for better readability
        columns_order = [
            'display_hierarchy', 'level_path', 'level', 'type', 'post_id', 'comment_id', 'parent_id',
            'title', 'body', 'author', 'score', 'created_utc', 'num_comments',
            'upvote_ratio', 'url', 'permalink', 'is_original_content', 'is_self',
            'over_18', 'spoiler', 'stickied'
        ]

        # Only include existing columns
        df = df[[col for col in columns_order if col in df.columns]]

        try:
            with pd.ExcelWriter(filename, engine='openpyxl') as writer:
                df.to_excel(writer, sheet_name='Reddit Data', index=False)

                worksheet = writer.sheets['Reddit Data']

                # Size each column to its widest value, capped at 50.
                for column in worksheet.columns:
                    widest = max(
                        (len(str(cell.value)) for cell in column if cell.value is not None),
                        default=0,
                    )
                    worksheet.column_dimensions[column[0].column_letter].width = min(widest + 2, 50)

            print(f"✅ Hierarchical data saved to {filename}")

        except ImportError:
            print("❌ openpyxl not installed. Saving as CSV instead.")
            # Swap only a trailing ".xlsx"; any other name just gets ".csv"
            # appended so the fallback never reuses the Excel file name.
            suffix = '.xlsx'
            stem = filename[:-len(suffix)] if filename.lower().endswith(suffix) else filename
            self.save_to_csv_with_hierarchy(data, stem + '.csv')

    def save_to_csv_with_hierarchy(self, data: List[Dict[str, Any]], filename: str):
        """Save hierarchical data to CSV, with the tree column placed first."""
        if not data:
            print("No data to save!")
            return

        df = pd.DataFrame(data)

        # Add display hierarchy column
        df['display_hierarchy'] = df.apply(self._display_label, axis=1)

        # Reorder to put hierarchy first
        other_columns = [col for col in df.columns if col != 'display_hierarchy']
        df = df[['display_hierarchy'] + other_columns]

        df.to_csv(filename, index=False, encoding='utf-8')
        print(f"✅ Hierarchical data saved to {filename}")

    def print_hierarchy_preview(self, data: List[Dict[str, Any]], max_rows: int = 20):
        """Print a preview of the hierarchical structure.

        Shows at most ``max_rows`` rows, indented by level, followed by a
        count of the rows that were not shown.
        """
        if not data:
            print("No data to display!")
            return

        print("\n" + "="*80)
        print("HIERARCHICAL PREVIEW (Tree Structure)")
        print("="*80)

        for i, item in enumerate(data[:max_rows]):
            indent = "  " * item['level']
            if item['type'] == 'post':
                print(f"{indent}📝 POST: {item['title']}")
                print(f"{indent}     By: {item['author']} | Score: {item['score']} | Comments: {item['num_comments']}")
            else:
                print(f"{indent}💬 [{item['level_path']}] {item['author']}: {item['body'][:100]}...")

            if i < max_rows - 1:
                # A missing next row counts as level 0, so the end of the data
                # also closes the current branch.
                level_next = data[i+1]['level'] if i+1 < len(data) else 0
                if level_next < item['level']:
                    print()  # Add space when going up levels

        if len(data) > max_rows:
            print(f"\n... and {len(data) - max_rows} more rows")

In [None]:
# Credentials are read from environment variables so that secrets are never
# stored in the notebook itself. Set REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET
# and REDDIT_USER_AGENT before starting the kernel; unset variables fall back
# to an empty string.
extractor = RedditDataExtractor(
    client_id=os.environ.get("REDDIT_CLIENT_ID", ""),          # your client id
    client_secret=os.environ.get("REDDIT_CLIENT_SECRET", ""),  # your client secret
    user_agent=os.environ.get("REDDIT_USER_AGENT", ""),        # user agent name
)

# Extract data
data = extractor.get_posts_with_comments(
    subreddit_name="",   # subreddit to scrape
    limit=400,           # how many posts to request
    posts_delay=2.0,     # seconds to wait between posts
    comments_delay=1.0,  # seconds to wait after each comment
)

In [5]:
# Keep the file name in a single variable so the confirmation message always
# names the file that was actually written.
output_csv = "pharil2.csv"
pd.DataFrame(data).to_csv(output_csv, index=False)
print(f"✅ Saved {len(data)} records to {output_csv}")

‚úÖ Saved 761 records to reddit_data.csv
