In [None]:
#farm made srcaping 


import requests
from bs4 import BeautifulSoup
import json
import time
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
import logging

# Configure logging
logging.basicConfig(filename='scraper_errors.log', level=logging.ERROR)


def create_robust_session():
    """Create a session with retry strategy"""
    session = requests.Session()
    retries = Retry(
        total=5,  # number of retries
        backoff_factor=1,  # wait 1, 2, 4, 8, 16 seconds between retries
        status_forcelist=[500, 502, 503, 504]  # retry on these status codes
    )
    adapter = HTTPAdapter(max_retries=retries)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session


def extract_website_text(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
        }

        print(f"Starting extraction from: {url}")
        session = create_robust_session()

        # Make request with increased timeout and chunked response
        response = session.get(url, headers=headers, timeout=60, stream=True)
        response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
        content = ''
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:
                content += chunk.decode('utf-8', errors='ignore')

        soup = BeautifulSoup(content, 'html.parser')
        return soup

    except requests.RequestException as e:
        print(f"Request error: {e}")
        logging.error(f"Request error: {e} for URL: {url}")
        return None
    except Exception as e:
        print(f"Unexpected error: {e}")
        logging.error(f"Unexpected error: {e} for URL: {url}")
        return None
    finally:
        if 'session' in locals():
            session.close()


def extract_product_details(soup):
    """Extracts product details from a product page."""
    details = {}
    try:
        details['name'] = soup.find('h1', class_='product-single__title').text.strip()  # Example
    except AttributeError:  # Handle cases where the element is not found
        details['name'] = None
    try:
        details['price'] = soup.find('span', class_='money').text.strip()  # Example
    except AttributeError:
        details['price'] = None
    try:
        details['description'] = soup.find('div', class_='product-single__description').text.strip()  # Example
    except AttributeError:
        details['description'] = None
    # Extract images, variants, etc.
    return details


def extract_about_us(soup):
    """Extracts 'about us' information from the about page."""
    about_info = {}
    try:
        about_info['content'] = soup.find('div', class_='rte').text.strip()  # Example
    except AttributeError:
        about_info['content'] = None
        print("Could not extract about us content")
        
    try:
        about_info['title'] = soup.find('h1', class_='page-title').text.strip()
        print(f"Extracted about us title: {about_info['title']}")
    except AttributeError:
        about_info['title'] = None
        print("Could not extract about us title")
        
    return about_info


def clean_extracted_data(data):
    """Clean and format the extracted data"""
    cleaned = {}
    for key, value in data.items():
        if isinstance(value, str):
            cleaned[key] = value.strip()
        elif isinstance(value, list):
            cleaned[key] = [item.strip() for item in value if item.strip()]
    return cleaned


def save_extracted_data(data, filename='extracted_data.json'):
    """Save the extracted data to a JSON file"""
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, indent=4, ensure_ascii=False, fp=f)
        print(f"\nData saved to {filename}")
    except Exception as e:
        print(f"Error saving data: {e}")
        logging.error(f"Error saving data: {e}")


def main():
    urls = {
        'free_range_eggs': 'https://farmmadefoods.com/collections/free-range-eggs?customer_posted=true',
        'product_page_1': 'https://farmmadefoods.com/products/farm-made-foods-free-range-eggs-6',  # Example product url Replace with a real product page
        'product_page_2': 'https://farmmadefoods.com/products/natural-coconut-sugar',  # Replace with a real product page
        'product_page_3': 'https://farmmadefoods.com/products/coconut-sugar-delights',  # Replace with a real product page
        'blog_page_1': 'https://farmmadefoods.com/blogs/recipe/sunny-side-up-eggs',  # Replace with a real product page
        'blog_page_2': 'https://farmmadefoods.com/blogs/recipe/mixed-herbs-omelette-easy-healthy-breakfast-recipe',  # Replace with a real product page
        'blog_page_3': 'https://farmmadefoods.com/blogs/recipe/english-breakfast-recipe-easy-guide',  # Replace with a real product page
        'blog_page_4': 'https://farmmadefoods.com/blogs/recipe/eggcellent-zesty-delight',  # Replace with a real product page
        'blog_page_5': 'https://farmmadefoods.com/blogs/recipe/scrambled-eggs-recipe',  # Replace with a real product page
        'contact_page_1': 'https://farmmadefoods.com/pages/contact',  # Replace with a real product page
        'about': 'https://farmmadefoods.com/pages/about-us',  # Replace with a real product page
        'return_policy': 'https://farmmadefoods.com/pages/refund-policy'  # Replace with the actual about us page URL
    }

    all_data = {}

    for key, url in urls.items():
        soup = extract_website_text(url)
        if soup:
            if key == 'free_range_eggs':
                all_data['free_range_eggs'] = {'title': soup.find('title').text.strip()}  # Example extract
            elif key == 'about':
                all_data['about'] = extract_about_us(soup)
            elif key.startswith('product_page'):
                all_data[key] = extract_product_details(soup)
            #else:
            #    all_data[key] = {'title': soup.find('title').text.strip()} # General title extraction
        else:
            print(f"Failed to extract data from {url}")

    # Save all data to a single JSON file
    with open('combined_data.json', 'w', encoding='utf-8') as f:
        json.dump(all_data, indent=4, ensure_ascii=False, fp=f)
    print("Data saved to combined_data.json")


if __name__ == "__main__":
    # url = "https://farmmadefoods.com/collections/free-range-eggs?customer_posted=true#FooterNewsletter"  # Example URL
    # print("Starting extraction...")
    #
    # start_time = time.time()
    # extracted_content = extract_website_text(url)
    # end_time = time.time()
    #
    # if extracted_content:
    #     print(f"\nExtraction completed in {end_time - start_time:.2f} seconds")
    #     print("\nExtracted Content Summary:")
    #     print("-" * 50)
    #
    #     for key, value in extracted_content.items():
    #         if isinstance(value, str):
    #             print(f"\n{key.upper()}:")
    #             print(value[:200] + "..." if len(value) > 200 else value)
    #         elif isinstance(value, list):
    #             print(f"\n{key.upper()} ({len(value)} items):")
    #             for item in value[:3]:  # Show first 3 items
    #                 print(f"- {item[:100]}..." if len(item) > 100 else f"- {item}")
    #             if len(value) > 3:
    #                 print(f"... and {len(value)-3} more items")
    #
    #     # Save the data
    #     save_extracted_data(extracted_content)
    # else:
    #     print("No content was extracted")

    main()


In [None]:
!pip install praw

In [5]:
#reddit
#market analysis
import praw
import json
import time

# Initialize Reddit API connection
user_agent = "market_research_bot 1.0"
reddit = praw.Reddit(
    client_id='E7lnxu7KkXI9HWF0cWOFww',
    client_secret='Ye__fRBy5a8753p62oIF9jdEt3j9Yw', 
    user_agent=user_agent
)

# Define search parameters
subreddits = ["eggs", "farming", "agriculture", "food", "business"]
keywords = [
    "egg prices", "egg market", "egg industry", "egg production",
    "egg supply", "egg demand", "egg shortage", "egg surplus",
    "egg farm", "egg producer", "egg wholesale", "egg retail"
]
limit = 200  # Posts per keyword

market_data = []

# Search across multiple subreddits
for subreddit_name in subreddits:
    subreddit = reddit.subreddit(subreddit_name)
    
    for keyword in keywords:
        try:
            # Search for market-related posts
            for submission in subreddit.search(keyword, limit=limit):
                post_data = {
                    'subreddit': subreddit_name,
                    'keyword': keyword,
                    'title': submission.title,
                    'content': submission.selftext,
                    'url': submission.url,
                    'author': str(submission.author),
                    'score': submission.score,
                    'upvote_ratio': submission.upvote_ratio,
                    'num_comments': submission.num_comments,
                    'created': time.strftime('%Y-%m-%d %H:%M:%S', 
                                          time.localtime(submission.created_utc)),
                    'comments': []
                }

                # Get relevant comments discussing prices, supply, demand etc.
                submission.comments.replace_more(limit=0)
                for comment in submission.comments.list():
                    if any(kw in comment.body.lower() for kw in 
                          ['price', 'cost', 'market', 'supply', 'demand', 'shortage']):
                        comment_data = {
                            'author': str(comment.author),
                            'text': comment.body,
                            'score': comment.score,
                            'created': time.strftime('%Y-%m-%d %H:%M:%S',
                                                   time.localtime(comment.created_utc))
                        }
                        post_data['comments'].append(comment_data)

                market_data.append(post_data)
            
            print(f"Collected data for keyword '{keyword}' in r/{subreddit_name}")
            time.sleep(2)  # Rate limiting
            
        except Exception as e:
            print(f"Error collecting data from r/{subreddit_name} for '{keyword}': {str(e)}")
            continue

# Save market research data
with open('egg_market_data.json', 'w', encoding='utf-8') as f:
    json.dump(market_data, f, indent=4, ensure_ascii=False)

print(f"\nCollected {len(market_data)} relevant posts")
print("Market research data saved to egg_market_data.json")


Collected data for keyword 'egg prices' in r/eggs


KeyboardInterrupt: 

In [4]:
#comment 
#reddit
import praw
import time
import logging

# Configure logging
logging.basicConfig(filename='reddit_scraper.log', level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')

def scrape_reddit(subreddit_name, keywords, limit=100):
    """
    Scrapes Reddit for posts and comments related to the specified keywords.
    Args:
        subreddit_name (str): The name of the subreddit to scrape.
        keywords (list): A list of keywords to search for.
        limit (int): The maximum number of posts to retrieve.
    Returns:
        list: A list of dictionaries, each containing data from a relevant post.
    """
    try:
        # Authenticate with Reddit API.  Replace with your credentials
        user_agent = "scapper 1.0 by/u/Business-Till-1699"
        reddit = praw.Reddit(
            client_id='E7lnxu7KkXI9HWF0cWOFww', 
            client_secret='Ye__fRBy5a8753p62oIF9jdEt3j9Yw',
            user_agent=user_agent
        )

        subreddit = reddit.subreddit(subreddit_name)
        relevant_posts = []

        for submission in subreddit.search(query=" OR ".join(keywords), sort="relevance", limit=limit):
            post_data = {
                'title': submission.title,
                'url': submission.url,
                'author': str(submission.author),
                'subreddit': subreddit_name,
                'upvote_ratio': submission.upvote_ratio,
                'num_comments': submission.num_comments,
                'score': submission.score,
                'created': time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(submission.created)),
                'keywords': keywords,
                'comments': []
            }

            # Extract comments
            for comment in submission.comments.list():
                comment_data = {
                    'author': str(comment.author),
                    'body': comment.body,
                    'score': comment.score,
                    'created': time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(comment.created))
                }
                post_data['comments'].append(comment_data)

            relevant_posts.append(post_data)
            logging.info(f"Extracted post: {submission.title}")

        return relevant_posts

    except Exception as e:
        logging.error(f"An error occurred during scraping: {e}")
        return []

def main():
    # Define subreddit and keywords
    subreddit_name = "eggs"  # Adjust as needed
    keywords = ["eggs", "egg market", "news on eggs", "news on farm made company", "eggs market value",
                "eggs consumer market", "eggs requirement"]
    limit = 100  # Number of posts to extract

    # Scrape Reddit
    reddit_data = scrape_reddit(subreddit_name, keywords, limit)

    # Save data to JSON file
    try:
        with open("reddit_eggs_data.json", "w", encoding="utf-8") as f:
            json.dump(reddit_data, f, indent=4, ensure_ascii=False)
        print("Successfully saved data to reddit_eggs_data.json")
        logging.info("Successfully saved data to reddit_eggs_data.json")
    except Exception as e:
        logging.error(f"Error saving data to JSON file: {e}")

if __name__ == "__main__":
    main()


Successfully saved data to reddit_eggs_data.json


In [29]:
!pip install spacy
!python -m spacy download en_core_web_sm
!pip install nltk
!pip install transformers
!pip install scikit-learn
!pip install bertopic
!pip install keybert
!pip install gensim
!pip install numpy

Collecting numpy>=1.19.0 (from spacy)
  Using cached numpy-2.0.2-cp310-cp310-win_amd64.whl.metadata (59 kB)
Using cached numpy-2.0.2-cp310-cp310-win_amd64.whl (15.9 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
Successfully installed numpy-2.0.2


  You can safely remove it manually.
  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gensim 4.3.3 requires numpy<2.0,>=1.18.5, but you have numpy 2.0.2 which is incompatible.
gensim 4.3.3 requires scipy<1.14.0,>=1.7.0, but you have scipy 1.14.1 which is incompatible.
tensorflow-intel 2.17.0 requires numpy<2.0.0,>=1.23.5; python_version <= "3.11", but you have numpy 2.0.2 which is incompatible.

[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
     - ------------------------------------- 0.5/12.8 MB 796.8 kB/s eta 0:00:16
     -- ------------------------------------- 0.8/12.8 MB 1.0 MB/s eta 0:00:12
     --- ----------------------------------- 1.0/12.8 MB 986.7 kB/s eta 0:00:12
     --- ----------------------------------- 1.0/12.8 MB 986.7 kB/s eta 0:00:12
     ---- ----------------------------------- 1.3/12.8 MB 1.0 MB/s eta 0:00:12
     ---- ----------------------------------- 1.6/12.8 MB 1.0 MB/s eta 0:00:11
     ----- ---------------------------------- 1.8/12.8 MB 1.1 MB/s eta 0:00:11
     ------ --------------------------------- 2.1/12


[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting bertopic


[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip



  Downloading bertopic-0.16.4-py3-none-any.whl.metadata (23 kB)
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.40-cp310-cp310-win_amd64.whl.metadata (15 kB)
Collecting plotly>=4.7.0 (from bertopic)
  Downloading plotly-6.0.0-py3-none-any.whl.metadata (5.6 kB)
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting numba>=0.51.2 (from umap-learn>=0.5.0->bertopic)
  Downloading numba-0.61.0-cp310-cp310-win_amd64.whl.metadata (2.8 kB)
Collecting pynndescent>=0.5 (from umap-learn>=0.5.0->bertopic)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Collecting llvmlite<0.45,>=0.44.0dev0 (from numba>=0.51.2->umap-learn>=0.5.0->bertopic)
  Downloading llvmlite-0.44.0-cp310-cp310-win_amd64.whl.metadata (5.0 kB)
Downloading bertopic-0.16.4-py3-none-any.whl (143 kB)
Downloading hdbscan-0.8.40-cp310-cp310-win_amd64.whl (730 kB)
   ---------------------------------------- 0.0/730.9 kB ? eta -:--


[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting numpy<2.0,>=1.18.5 (from gensim)
  Using cached numpy-1.26.4-cp310-cp310-win_amd64.whl.metadata (61 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Using cached scipy-1.13.1-cp310-cp310-win_amd64.whl.metadata (60 kB)
Using cached numpy-1.26.4-cp310-cp310-win_amd64.whl (15.8 MB)
Using cached scipy-1.13.1-cp310-cp310-win_amd64.whl (46.2 MB)
Installing collected packages: numpy, scipy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
  Attempting uninstall: scipy
    Found existing installation: scipy 1.14.1
    Uninstalling scipy-1.14.1:
      Successfully uninstalled scipy-1.14.1
Successfully installed numpy-1.26.4 scipy-1.13.1


  You can safely remove it manually.
  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
blis 1.0.1 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.4 which is incompatible.
thinc 8.3.2 requires numpy<2.1.0,>=2.0.0; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.

[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [36]:
# Import required libraries
import pandas as pd
import json
from transformers import BertTokenizer, BertModel
import torch
from tqdm import tqdm

def clean_text(text):
    """Basic text cleaning"""
    if not isinstance(text, str):
        return ""
    # Remove special characters and extra whitespace
    text = ' '.join(text.split())
    text = text.lower()
    return text

def get_bert_embeddings(text, tokenizer, model):
    """Get BERT embeddings for text"""
    # Tokenize and get BERT embeddings
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    # Use [CLS] token embedding as text representation
    embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
    return embeddings[0]

def preprocess_data_with_bert():
    try:
        # Load BERT tokenizer and model
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertModel.from_pretrained('bert-base-uncased')
        
        # Move model to GPU if available
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(device)
        
        # Load data
        with open('egg_market_data.json', 'r', encoding='utf-8') as f:
            market_data = json.load(f)
        with open('reddit_eggs_data.json', 'r', encoding='utf-8') as f:
            reddit_data = json.load(f)

        # Process market data
        cleaned_market_data = []
        for post in tqdm(market_data, desc="Processing market data"):
            # Clean and combine relevant text fields
            title = clean_text(post.get('title', ''))
            content = clean_text(post.get('content', ''))
            combined_text = f"{title} {content}"
            
            if combined_text.strip():  # Only process non-empty text
                # Get BERT embeddings
                embeddings = get_bert_embeddings(combined_text, tokenizer, model)
                
                # Create cleaned post object
                cleaned_post = {
                    'title': title,
                    'content': content,
                    'created': post.get('created', ''),
                    'score': post.get('score', 0),
                    'bert_embeddings': embeddings.tolist(),
                    'comments': []
                }
                
                # Process comments
                for comment in post.get('comments', []):
                    comment_text = clean_text(comment.get('text', ''))
                    if comment_text:
                        comment_embeddings = get_bert_embeddings(comment_text, tokenizer, model)
                        cleaned_post['comments'].append({
                            'text': comment_text,
                            'score': comment.get('score', 0),
                            'created': comment.get('created', ''),
                            'bert_embeddings': comment_embeddings.tolist()
                        })
                        
                cleaned_market_data.append(cleaned_post)

        # Process reddit data similarly
        cleaned_reddit_data = []
        for post in tqdm(reddit_data, desc="Processing reddit data"):
            title = clean_text(post.get('title', ''))
            body = clean_text(post.get('body', ''))
            combined_text = f"{title} {body}"
            
            if combined_text.strip():
                embeddings = get_bert_embeddings(combined_text, tokenizer, model)
                
                cleaned_post = {
                    'title': title,
                    'body': body,
                    'created': post.get('created', ''),
                    'score': post.get('score', 0),
                    'bert_embeddings': embeddings.tolist(),
                    'comments': []
                }
                
                for comment in post.get('comments', []):
                    comment_text = clean_text(comment.get('body', ''))
                    if comment_text:
                        comment_embeddings = get_bert_embeddings(comment_text, tokenizer, model)
                        cleaned_post['comments'].append({
                            'text': comment_text,
                            'score': comment.get('score', 0),
                            'created': comment.get('created', ''),
                            'bert_embeddings': comment_embeddings.tolist()
                        })
                        
                cleaned_reddit_data.append(cleaned_post)

        # Save processed data
        with open('processed_market_data.json', 'w', encoding='utf-8') as f:
            json.dump(cleaned_market_data, f, indent=4, ensure_ascii=False)
        with open('processed_reddit_data.json', 'w', encoding='utf-8') as f:
            json.dump(cleaned_reddit_data, f, indent=4, ensure_ascii=False)

        print("Data preprocessing complete. Files saved as processed_market_data.json and processed_reddit_data.json")
        
    except Exception as e:
        print(f"An error occurred: {str(e)}")

# Execute the preprocessing
if __name__ == "__main__":
    preprocess_data_with_bert()


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Processing market data: 100%|██████████| 449/449 [28:29<00:00,  3.81s/it]    
Processing reddit data: 100%|██████████| 123/123 [19:54<00:00,  9.71s/it] 


Data preprocessing complete. Files saved as processed_market_data.json and processed_reddit_data.json


In [5]:
!pip install -r requirements.txt
!pip install transformers gensim numpy torch tensorflow pandas scikit-learn


[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: Could not open requirements file: [Errno 2] No such file or directory: 'requirements.txt'





[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
!pip install transformers gensim numpy torch tensorflow pandas scikit-learn


[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip




In [11]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from bertopic import BERTopic
import numpy as np
import json
import logging
from datetime import datetime
from collections import defaultdict
import matplotlib.pyplot as plt

# Configure logging
logging.basicConfig(filename='conversation_starter.log', level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')

# Load data
try:
    with open('processed_reddit_data.json', 'r', encoding='utf-8') as f:
        combined_data = json.load(f)
    logging.info("Data loaded successfully.")
except FileNotFoundError as e:
    logging.error(f"Data file not found: {e}")
    raise
except json.JSONDecodeError as e:
    logging.error(f"JSON decode error: {e}")
    raise
except Exception as e:
    logging.error(f"Unexpected error loading data: {e}")
    raise

# Prepare documents for topic modeling
documents = []
for post in combined_data:
    body = post.get('body', '')
    if isinstance(body, str):  # Ensure the body is a string
        documents.append(body)
    else:
        logging.warning(f"Skipping non-string body: {body}")

logging.info(f"Prepared {len(documents)} documents for topic modeling.")

# Advanced Topic Modeling with BERTopic
try:
    topic_model = BERTopic(language="english", calculate_probabilities=True)
    topics, _ = topic_model.fit_transform(documents)
    logging.info("BERTopic model trained successfully.")
except Exception as e:
    logging.error(f"BERTopic model training failed: {e}")
    raise

# Visualize Topics
try:
    topic_model.visualize_topics().write_html("topic_visualization.html")
    logging.info("Topic visualization saved to topic_visualization.html.")
except Exception as e:
    logging.error(f"Error visualizing topics: {e}")

# Load Gemma Model
try:
    model_name = "google/gemma-2b"  # Use a larger model if available
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    logging.info("Gemma model loaded successfully.")
except Exception as e:
    logging.error(f"Failed to load language model: {e}")
    raise

def generate_conversation_starter(topic_words):
    """Generates a conversation starter using the Gemma model."""
    try:
        prompt = f"Generate an engaging question about eggs and market demand based on these topics: {', '.join(topic_words)}"
        inputs = tokenizer(prompt, return_tensors="pt")
        outputs = model.generate(
            inputs.input_ids,
            max_length=100,
            num_return_sequences=1,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id
        )
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        logging.error(f"Error generating conversation starter: {e}")
        return None

# Find Egg-Related Topics and Generate Starters
egg_related_posts = []
for idx, post in enumerate(combined_data):
    try:
        # Get topic distribution for post
        topic_dist = topic_model.get_document_info(documents[idx]).Topic
        
        # Check if any topics are egg-related
        topic_words = topic_model.get_topic(topic_dist)
        if any(word in ['egg', 'eggs', 'dha', 'chicken', 'farm'] for word, _ in topic_words):
            # Calculate engagement score
            engagement_score = post.get('score', 0) + sum(c.get('score', 0) for c in post.get('comments', []))
            engagement_score *= (1 + topic_model.get_document_info(documents[idx]).Probability)

            # Generate conversation starter
            conversation_starter = generate_conversation_starter([word for word, _ in topic_words])
            if conversation_starter:
                egg_related_posts.append({
                    'title': post.get('title', ''),
                    'engagement_score': engagement_score,
                    'topic_probability': float(topic_model.get_document_info(documents[idx]).Probability),
                    'topic_words': [word for word, _ in topic_words],
                    'conversation_starter': conversation_starter
                })
    except Exception as e:
        logging.error(f"Error processing post {idx}: {e}")
        continue

# Sort by engagement score
egg_related_posts.sort(key=lambda x: x['engagement_score'], reverse=True)

# Print results
print("\nTop High Engagement Egg-Related Conversation Starters:")
for post in egg_related_posts[:5]:
    print(f"\nTitle: {post['title']}")
    print(f"Engagement Score: {post['engagement_score']:.2f}")
    print(f"Topic Probability: {post['topic_probability']:.2f}")
    print(f"Topic Words: {', '.join(post['topic_words'])}")
    print(f"Conversation Starter: {post['conversation_starter']}")

# Save results
try:
    with open("egg_conversation_starters.json", "w", encoding="utf-8") as f:
        json.dump(egg_related_posts, f, indent=4, ensure_ascii=False)
    logging.info("Successfully saved conversation starters to JSON file.")
except Exception as e:
    logging.error(f"Error saving to JSON file: {e}")

# Visualize Engagement Scores
try:
    engagement_scores = [post['engagement_score'] for post in egg_related_posts]
    plt.hist(engagement_scores, bins=20, edgecolor='black')
    plt.xlabel("Engagement Score")
    plt.ylabel("Frequency")
    plt.title("Distribution of Engagement Scores")
    plt.savefig("engagement_scores_distribution.png")
    plt.close()
    logging.info("Engagement scores visualization saved to engagement_scores_distribution.png.")
except Exception as e:
    logging.error(f"Error visualizing engagement scores: {e}")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

ValueError: empty vocabulary; perhaps the documents only contain stop words

In [14]:
# Prepare documents for topic modeling
documents = [post.get('title', '') for post in combined_data if post.get('title')]

In [27]:
from nltk.corpus import stopwords
import nltk
import re

# Download NLTK stopwords (if not already downloaded)
nltk.download('stopwords')

# Define a preprocessing function
def preprocess_text(text):
    # Remove special characters and numbers
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    # Convert to lowercase
    text = text.lower()
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    text = " ".join([word for word in text.split() if word not in stop_words])
    return text

# Preprocess documents
documents = [preprocess_text(post.get('title', '')) for post in combined_data if post.get('title')]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [28]:
from sklearn.feature_extraction.text import CountVectorizer

# Define a custom CountVectorizer with stop words
custom_stop_words = ["the", "for", "in", "and", "is", "of", "a", "to"]
vectorizer_model = CountVectorizer(stop_words=custom_stop_words)

# Initialize BERTopic with the custom vectorizer
topic_model = BERTopic(
    language="english",
    calculate_probabilities=True,
    vectorizer_model=vectorizer_model
)

In [30]:
import json

# Step 1: Load your JSON dataset
with open("processed_market_data.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Step 2: Calculate engagement score (post score + sum of all comment scores)
for post in data:
    # Sum the scores of all comments
    total_comment_score = sum(comment.get("score", 0) for comment in post.get("comments", []))
    # Calculate engagement score
    engagement_score = post.get("score", 0) + total_comment_score
    # Add the engagement score to the post
    post["engagement_score"] = engagement_score

# Step 3: Sort posts by engagement score (descending)
sorted_data = sorted(data, key=lambda x: x.get("engagement_score", 0), reverse=True)

# Step 4: Select top 100 posts
top_100_posts = sorted_data[:100]

# Step 5: Print top 100 posts to console
print("Top 100 Posts:")
for i, post in enumerate(top_100_posts):
    print(f"\nRank {i + 1}:")
    print(f"Title: {post.get('title', 'N/A')}")
    print(f"Engagement Score: {post.get('engagement_score', 0)}")
    print(f"Sentiment: {post.get('sentiment', 'N/A')}")  # Optional field
    print(f"Topic: {post.get('topic', 'N/A')}")  # Optional field
    print("-" * 50)

# Step 6: Save top 100 posts to a JSON file
with open("top_100_posts.json", "w", encoding="utf-8") as f:
    json.dump(top_100_posts, f, indent=4, ensure_ascii=False)

print("\nTop 100 posts saved to 'top_100_posts.json'.")

Top 100 Posts:

Rank 1:
Title: [pro/chef] macarons
Engagement Score: 28822
Sentiment: N/A
Topic: N/A
--------------------------------------------------

Rank 2:
Title: [homemade] bibimbap
Engagement Score: 1730
Sentiment: N/A
Topic: N/A
--------------------------------------------------

Rank 3:
Title: 2 dozen eggs at costco in los angeles $7.69
Engagement Score: 1548
Sentiment: N/A
Topic: N/A
--------------------------------------------------

Rank 4:
Title: sometimes you just need eggs
Engagement Score: 1057
Sentiment: N/A
Topic: N/A
--------------------------------------------------

Rank 5:
Title: a wrinkled egg from farm chickens by my house. i ate it.
Engagement Score: 643
Sentiment: N/A
Topic: N/A
--------------------------------------------------

Rank 6:
Title: as egg prices soar, trump administration plans new strategy to fight bird flu
Engagement Score: 591
Sentiment: N/A
Topic: N/A
--------------------------------------------------

Rank 7:
Title: about time we let the peop

In [37]:
import json
from transformers import pipeline
import torch
import logging
from tqdm import tqdm # Import tqdm for progress bar

# Configure logging
logging.basicConfig(filename='sentiment_analysis.log', level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')

# Load your JSON dataset
try:
    with open("processed_market_data.json", "r", encoding="utf-8") as file:
        data = json.load(file)
except FileNotFoundError as e:
    logging.error(f"File not found: {e}")
    raise
except json.JSONDecodeError as e:
    logging.error(f"JSON decode error: {e}")
    raise
except Exception as e:
    logging.error(f"Unexpected error loading data: {e}")
    raise

# Initialize models (move outside loop for efficiency)
try:
    # Load CUDA devices
    device = 0 if torch.cuda.is_available() else -1

    # Sentiment analysis model (small and fast)
    sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert/distilbert-base-uncased", device=device)
    # Topic classification model (specify a smaller model if possible)
    topic_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=device)
except Exception as e:
    logging.error(f"Error initializing models: {e}")
    raise

# Define candidate labels for topic classification
candidate_labels = ["economy", "technology", "health", "politics", "entertainment"]

# Batch size
batch_size = 3  # Adjust based on your resources

# Create batches of data to increase efficiency
for i in tqdm(range(0, len(data), batch_size), desc="Processing Batches"):
    batch = data[i:i + batch_size]

    # Extract text for batch
    batch_text = [post.get("title", "") + " ".join(comment.get("text", "") for comment in post.get("comments", [])) for post in batch]

    # Perform sentiment analysis for batch
    try:
        sentiment_results = sentiment_analyzer(batch_text)
        for post, sentiment_result in zip(batch, sentiment_results):
            post["sentiment"] = sentiment_result['label']
            post["sentiment_score"] = sentiment_result['score']
    except Exception as e:
        logging.warning(f"Error performing sentiment analysis for batch: {e}")
        for post in batch:
            post["sentiment"] = "N/A"
            post["sentiment_score"] = 0.0

    # Perform topic classification for batch
    try:
        topic_results = topic_classifier(batch_text, candidate_labels)
        for post, topic_result in zip(batch, topic_results):
            post["topic"] = topic_result['labels'][0]
    except Exception as e:
        logging.warning(f"Error performing topic classification for batch: {e}")
        for post in batch:
            post["topic"] = "N/A"

# Calculate engagement score (post score + sum of all comment scores)
for post in data:
    # Sum the scores of all comments
    total_comment_score = sum(comment.get("score", 0) for comment in post.get("comments", []))
    # Calculate engagement score
    engagement_score = post.get("score", 0) + total_comment_score
    # Add the engagement score to the post
    post["engagement_score"] = engagement_score

# Sort posts by engagement score (descending)
sorted_data = sorted(data, key=lambda x: x.get("engagement_score", 0), reverse=True)

# Select top 100 posts
top_100_posts = sorted_data[:100]

# Print top 100 posts to console
print("Top 100 Posts:")
for i, post in enumerate(top_100_posts):
    print(f"\nRank {i + 1}:")
    print(f"Title: {post.get('title', 'N/A')}")
    print(f"Engagement Score: {post.get('engagement_score', 0)}")
    print(f"Sentiment: {post.get('sentiment', 'N/A')} (Score: {post.get('sentiment_score', 0):.2f})")
    print(f"Topic: {post.get('topic', 'N/A')}")
    print("-" * 50)

# Save top 100 posts to a JSON file
try:
    with open("top_100_posts.json", "w", encoding="utf-8") as f:
        json.dump(top_100_posts, f, indent=4, ensure_ascii=False)
    print("\nTop 100 posts saved to 'top_100_posts.json'.")
except Exception as e:
    logging.error(f"Error saving data to JSON file: {e}")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Processing Batches:   0%|          | 0/150 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (683 > 512). Running this sequence through the model will result in indexing errors
Processing Batches: 100%|██████████| 150/150 [1:12:24<00:00, 28.97s/it]


Top 100 Posts:

Rank 1:
Title: [pro/chef] macarons
Engagement Score: 28822
Sentiment: N/A (Score: 0.00)
Topic: entertainment
--------------------------------------------------

Rank 2:
Title: [homemade] bibimbap
Engagement Score: 1730
Sentiment: LABEL_0 (Score: 0.54)
Topic: entertainment
--------------------------------------------------

Rank 3:
Title: 2 dozen eggs at costco in los angeles $7.69
Engagement Score: 1548
Sentiment: N/A (Score: 0.00)
Topic: technology
--------------------------------------------------

Rank 4:
Title: sometimes you just need eggs
Engagement Score: 1057
Sentiment: LABEL_0 (Score: 0.52)
Topic: entertainment
--------------------------------------------------

Rank 5:
Title: a wrinkled egg from farm chickens by my house. i ate it.
Engagement Score: 643
Sentiment: LABEL_0 (Score: 0.52)
Topic: health
--------------------------------------------------

Rank 6:
Title: as egg prices soar, trump administration plans new strategy to fight bird flu
Engagement Score: 5

In [2]:
#comment 
#reddit
import praw
import time
import logging

# Configure logging
logging.basicConfig(filename='reddit_scraper.log', level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')

def scrape_reddit(subreddit_name, keywords, limit=100):
    """
    Scrapes Reddit for posts and comments related to the specified keywords.
    Args:
        subreddit_name (str): The name of the subreddit to scrape.
        keywords (list): A list of keywords to search for.
        limit (int): The maximum number of posts to retrieve.
    Returns:
        list: A list of dictionaries, each containing data from a relevant post.
    """
    try:
        # Authenticate with Reddit API.  Replace with your credentials
        user_agent = "scapper 1.0 by/u/Business-Till-1699"
        reddit = praw.Reddit(
            client_id='E7lnxu7KkXI9HWF0cWOFww', 
            client_secret='Ye__fRBy5a8753p62oIF9jdEt3j9Yw',
            user_agent=user_agent
        )

        subreddit = reddit.subreddit(subreddit_name)
        relevant_posts = []

        for submission in subreddit.search(query=" OR ".join(keywords), sort="relevance", limit=limit):
            post_data = {
                'title': submission.title,
                'url': submission.url,
                'author': str(submission.author),
                'subreddit': subreddit_name,
                'upvote_ratio': submission.upvote_ratio,
                'num_comments': submission.num_comments,
                'score': submission.score,
                'created': time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(submission.created)),
                'keywords': keywords,
                'comments': []
            }

            # Extract comments
            for comment in submission.comments.list():
                comment_data = {
                    'author': str(comment.author),
                    'body': comment.body,
                    'score': comment.score,
                    'created': time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(comment.created))
                }
                post_data['comments'].append(comment_data)

            relevant_posts.append(post_data)
            logging.info(f"Extracted post: {submission.title}")

        return relevant_posts

    except Exception as e:
        logging.error(f"An error occurred during scraping: {e}")
        return []

def main():
    # Define subreddit and keywords
    subreddit_name = "eggs"  # Adjust as needed
    keywords = [
    "organic_eggs", "pasture_raised", "cage_free", "sustainable_poultry", "farm_fresh_eggs",
    "organic_sweetener", "natural_sugar", "vegan_friendly", "farm_to_table", "locally_sourced"
    ]
    limit = 100  # Number of posts to extract

    # Scrape Reddit
    reddit_data = scrape_reddit(subreddit_name, keywords, limit)

    # Save data to JSON file
    try:
        with open("reddit_data.json", "w", encoding="utf-8") as f:
            json.dump(reddit_data, f, indent=4, ensure_ascii=False)
        print("Successfully saved data to reddit_data.json")
        logging.info("Successfully saved data to reddit_eggs_data.json")
    except Exception as e:
        logging.error(f"Error saving data to JSON file: {e}")

if __name__ == "__main__":
    main()


reddit 


In [9]:
!pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2



[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [18]:
import praw
import json
import time
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Initialize Reddit API connection
user_agent = "market_research_bot 1.0"
reddit = praw.Reddit(
    client_id='E7lnxu7KkXI9HWF0cWOFww',
    client_secret='Ye__fRBy5a8753p62oIF9jdEt3j9Yw', 
    user_agent=user_agent
)

# Initialize sentiment analyzer
vader_analyzer = SentimentIntensityAnalyzer()

# Define search parameters
subreddits = ["eggs","farm made eggs","free range eggs"]
keywords = [
    "farmmade", "farm made eggs", "farm-made", "free-range eggs", "Free Range Eggs",
    "Farmmade", "Farm Made","eggs"
]
limit = 200  # Posts per keyword

market_data = []

# Function to calculate engagement score
def calculate_engagement(score, upvote_ratio, num_comments):
    return round((score * upvote_ratio + num_comments * 0.5) / 2, 2)

# Function to get basic sentiment score
def get_basic_sentiment(text):
    # Simple keyword-based sentiment scoring
    positive_words = ['good', 'great', 'excellent', 'amazing', 'love', 'best']
    negative_words = ['bad', 'poor', 'terrible', 'worst', 'hate', 'awful']
    
    text = text.lower()
    pos_count = sum(1 for word in positive_words if word in text)
    neg_count = sum(1 for word in negative_words if word in text)
    
    if pos_count == neg_count:
        return 0.0
    return round((pos_count - neg_count) / (pos_count + neg_count), 2)

# Search across multiple subreddits
for subreddit_name in subreddits:
    subreddit = reddit.subreddit(subreddit_name)
    
    for keyword in keywords:
        try:
            # Search for market-related posts
            for submission in subreddit.search(keyword, limit=limit):
                
                # Compute sentiment score using VADER
                sentiment = vader_analyzer.polarity_scores(submission.title + " " + submission.selftext)
                sentiment_score = round(sentiment['compound'], 2)
                
                # Compute basic sentiment score as alternative to semantic score
                basic_sentiment = get_basic_sentiment(submission.title + " " + submission.selftext)
                
                # Compute engagement score
                engagement_score = calculate_engagement(submission.score, submission.upvote_ratio, submission.num_comments)
                
                post_data = {
                    'subreddit': subreddit_name,
                    'keyword': keyword,
                    'title': submission.title,
                    'content': submission.selftext,
                    'url': submission.url,
                    'post_id': submission.id,
                    'post_url': f"https://reddit.com{submission.permalink}",
                    'author': str(submission.author),
                    'score': submission.score,
                    'upvote_ratio': submission.upvote_ratio,
                    'num_comments': submission.num_comments,
                    'sentiment_score': sentiment_score,
                    'basic_sentiment': basic_sentiment,
                    'engagement_score': engagement_score,
                    'created': time.strftime('%Y-%m-%d %H:%M:%S', 
                                          time.localtime(submission.created_utc)),
                    'comments': []
                }

                # Get relevant comments discussing prices, supply, demand etc.
                submission.comments.replace_more(limit=0)
                for comment in submission.comments.list():
                    if any(kw in comment.body.lower() for kw in 
                          ['price', 'cost', 'market', 'supply', 'demand', 'shortage']):
                        
                        # Calculate scores for comment
                        comment_sentiment = vader_analyzer.polarity_scores(comment.body)
                        comment_basic = get_basic_sentiment(comment.body)
                        
                        comment_data = {
                            'author': str(comment.author),
                            'text': comment.body,
                            'score': comment.score,
                            'comment_id': comment.id,
                            'comment_url': f"https://reddit.com{comment.permalink}",
                            'sentiment_score': round(comment_sentiment['compound'], 2),
                            'basic_sentiment': comment_basic,
                            'created': time.strftime('%Y-%m-%d %H:%M:%S',
                                                   time.localtime(comment.created_utc))
                        }
                        post_data['comments'].append(comment_data)

                market_data.append(post_data)
            
            print(f"Collected data for keyword '{keyword}' in r/{subreddit_name}")
            time.sleep(2)  # Rate limiting
            
        except Exception as e:
            print(f"Error collecting data from r/{subreddit_name} for '{keyword}': {str(e)}")
            continue

# Save market research data
with open('reddit_data.json', 'w', encoding='utf-8') as f:
    json.dump(market_data, f, indent=4, ensure_ascii=False)

print(f"\nCollected {len(market_data)} relevant posts")
print("Market research data saved to reddit_data.json")


Collected data for keyword 'farmmade' in r/eggs
Collected data for keyword 'farm made eggs' in r/eggs
Collected data for keyword 'farm-made' in r/eggs
Collected data for keyword 'free-range eggs' in r/eggs
Collected data for keyword 'Free Range Eggs' in r/eggs
Collected data for keyword 'Farmmade' in r/eggs
Collected data for keyword 'Farm Made' in r/eggs
Collected data for keyword 'eggs' in r/eggs
Error collecting data from r/farm made eggs for 'farmmade': received 404 HTTP response
Error collecting data from r/farm made eggs for 'farm made eggs': received 404 HTTP response
Error collecting data from r/farm made eggs for 'farm-made': received 404 HTTP response
Error collecting data from r/farm made eggs for 'free-range eggs': received 404 HTTP response
Error collecting data from r/farm made eggs for 'Free Range Eggs': received 404 HTTP response
Error collecting data from r/farm made eggs for 'Farmmade': received 404 HTTP response
Error collecting data from r/farm made eggs for 'Farm M

In [None]:
# Set Reddit API credentials
import os

os.environ["REDDIT_CLIENT_ID"] = "E7lnxu7KkXI9HWF0cWOFww"
os.environ["REDDIT_CLIENT_SECRET"] = "Ye__fRBy5a8753p62oIF9jdEt3j9Yw" 
os.environ["REDDIT_USERNAME"] = "Flat_Ad5698"
os.environ["REDDIT_PASSWORD"] = "Chandra@6176"