In [None]:
import requests
from bs4 import BeautifulSoup as bs
import time
import pandas as pd 

# Define a function to scrape the article content from an article link
def scrape_article_content(article_link, session):
    """Download one article page and return the text of its first post.

    Args:
        article_link: URL of the article page to fetch.
        session: Object exposing ``get(url, timeout=...)``; the caller in
            this notebook passes a ``requests.Session``.

    Returns:
        The text of the first ``<div class="post">`` found on the page
        (extracted with ``get_text(strip=True)``), or the sentinel string
        ``"N/A"`` when the request raises a ``RequestException``, the HTTP
        status is not 200, or the page has no such div.
    """
    try:
        page = session.get(article_link, timeout=10)  # never wait longer than 10 s

        # Guard clause: anything other than a plain 200 counts as a failure.
        if page.status_code != 200:
            print(f"Failed to retrieve {article_link}, status code: {page.status_code}")
            return "N/A"

        soup = bs(page.content, 'html.parser')

        # Only the first div carrying the 'post' class is used; any further
        # matches on the page are ignored.
        first_post = soup.find('div', class_='post')
        if first_post is None:
            return "N/A"
        return first_post.get_text(strip=True)
    except requests.exceptions.RequestException as err:
        print(f"Failed to retrieve content from {article_link}: {err}")
        return "N/A"

# Define a function to scrape all articles from all categories and save in one file
def scrape_all_article_content(categories=None, output_file='all_articles_content.csv'):
    """Scrape every article of every category into a single CSV file.

    For each category row the category page is downloaded (up to three
    attempts), every ``tr.topic-list-item`` row on it is treated as an
    article, the article body is fetched with ``scrape_article_content`` and
    one CSV row is written per article.

    Args:
        categories: DataFrame with ``'Category Name'`` and ``'Category Link'``
            columns. When omitted, the module-level ``df`` is used, which is
            what the original zero-argument call relied on.
        output_file: Path of the CSV file to (over)write.

    Raises:
        NameError: If ``categories`` is omitted and no module-level ``df``
            exists.
    """
    # Function-scope import keeps this cell self-contained; the csv module
    # quotes commas, quotes and newlines found in titles and article bodies,
    # which hand-built f-string rows did not.
    import csv

    if categories is None:
        try:
            categories = df
        except NameError:
            raise NameError(
                "name 'df' is not defined: pass the categories DataFrame as "
                "`categories` or define a module-level `df` with "
                "'Category Name' and 'Category Link' columns"
            ) from None

    base_url = "https://gov.optimism.io"
    max_attempts = 3

    # The session is used as a context manager so its connections are released.
    with requests.Session() as session:
        session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'})

        # newline='' is required by the csv module; utf-8 avoids
        # platform-dependent encoding errors on non-ASCII article text.
        with open(output_file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(["Category Name", "Article Name", "Article Link", "Article Content"])

            for _, row in categories.iterrows():
                category_name = row['Category Name']
                category_link = row['Category Link']

                # Ensure the URL is properly formed
                if not category_link.startswith("http"):
                    category_link = f"{base_url}{category_link}"

                response = None

                # Retry with a linearly growing delay (5 s, then 10 s).
                for attempt in range(1, max_attempts + 1):
                    try:
                        response = session.get(category_link, timeout=10)
                        break
                    except requests.exceptions.RequestException as e:
                        if attempt == max_attempts:
                            # Nothing left to retry, so do not sleep.
                            print(f"Attempt {attempt}: RequestException for {category_link}: {e}.")
                        else:
                            delay = 5 * attempt
                            print(f"Attempt {attempt}: RequestException for {category_link}: {e}. Retrying in {delay} seconds...")
                            time.sleep(delay)

                if response is None:
                    print(f"Failed to retrieve {category_link} after {max_attempts} attempts.")
                    continue

                # Same status check as scrape_article_content: do not parse an
                # error page as if it were a topic list.
                if response.status_code != 200:
                    print(f"Failed to retrieve {category_link}, status code: {response.status_code}")
                    continue

                data = bs(response.content, 'html.parser')

                for article in data.find_all('tr', class_='topic-list-item'):
                    title_tag = article.find('a', class_='title')
                    article_name = title_tag.get_text(strip=True) if title_tag else "N/A"
                    href = title_tag.get('href') if title_tag else None

                    if href:
                        # If the article link is relative, prepend the base URL
                        article_link = f"{base_url}{href}" if href.startswith('/') else href
                        article_content = scrape_article_content(article_link, session)
                    else:
                        # No usable link: record the row without issuing a
                        # request that can only fail.
                        article_link = "N/A"
                        article_content = "N/A"

                    writer.writerow([category_name, article_name, article_link, article_content])
                    # Flush per row so an interrupted run keeps what was scraped so far.
                    f.flush()

                    print(f"Content for article '{article_name}' saved to {output_file}.")

# NOTE(review): this call relies on a module-level DataFrame `df` with
# 'Category Name' and 'Category Link' columns. No earlier cell visible in this
# notebook defines it, which is what produces the NameError recorded below.
scrape_all_article_content()


NameError: name 'df' is not defined

In [4]:
import os

import google.generativeai as genai

# The API key is read from the GOOGLE_API_KEY environment variable instead of
# being embedded in the notebook, so it is not leaked when the notebook is
# shared or committed. A missing variable fails fast with a KeyError.
# NOTE(review): the key that used to be hardcoded here should be treated as
# compromised and revoked.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
model = genai.GenerativeModel("gemini-1.5-flash")
response = model.generate_content("Write a story about a magic backpack.")
print(response.text)

In [5]:
# Dump the whole response object (candidates, finish reason, safety ratings)
# rather than only `response.text`.
# NOTE(review): the saved output below contains a joke, while the only prompt
# visible in this notebook asks for a story; the output presumably comes from
# a different run. Re-run the notebook top to bottom before relying on it.
print(response)

response:
GenerateContentResponse(
    done=True,
    iterator=None,
    result=protos.GenerateContentResponse({
      "candidates": [
        {
          "content": {
            "parts": [
              {
                "text": "Why don't scientists trust atoms?\n\nBecause they make up everything! \n"
              }
            ],
            "role": "model"
          },
          "finish_reason": "STOP",
          "index": 0,
          "safety_ratings": [
            {
              "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
              "probability": "NEGLIGIBLE"
            },
            {
              "category": "HARM_CATEGORY_HATE_SPEECH",
              "probability": "NEGLIGIBLE"
            },
            {
              "category": "HARM_CATEGORY_HARASSMENT",
              "probability": "NEGLIGIBLE"
            },
            {
              "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
              "probability": "NEGLIGIBLE"
            }
          ]
        }

In [None]:
import os

import pandas as pd
import google.generativeai as genai

# Configure the Gemini API. The key comes from the GOOGLE_API_KEY environment
# variable rather than a literal in the notebook, so it is not leaked when the
# notebook is shared; a missing variable fails fast with a KeyError.
# NOTE(review): the key that used to be hardcoded here should be revoked.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
model = genai.GenerativeModel("gemini-1.5-flash")

# Load the articles written by the scraping cell.
df = pd.read_csv("all_articles_content.csv")

# Function to summarize article content
def summarize_article(article_content):
    """Return a model-generated summary of one article body.

    Args:
        article_content: Article text. Missing values (``None``, NaN,
            ``pd.NA``), blank strings and the scraper's ``"N/A"`` sentinel
            are accepted and short-circuit without calling the model.

    Returns:
        The summary text, or ``"N/A"`` when there is nothing to summarize or
        the model response carries no text.
    """
    # pandas reads the scraper's "N/A" sentinel back as NaN by default, so
    # missing content usually arrives here as a float NaN rather than a string.
    if not isinstance(article_content, str) and pd.isna(article_content):
        return "N/A"
    if isinstance(article_content, str) and article_content.strip() in ("", "N/A"):
        return "N/A"

    prompt = f"You are an expert in writing and have to summarize this content: {article_content}"
    response = model.generate_content(prompt)
    try:
        return response.text
    except ValueError as e:
        # The `.text` accessor raises ValueError when the response holds no
        # text part (e.g. a blocked candidate); one such article should not
        # abort the whole batch.
        print(f"No summary text returned: {e}")
        return "N/A"

# Create a new column for summaries. The scraper writes the article body under
# the "Article Content" header; the CSV has no "Article" column, so indexing
# with 'Article' raised a KeyError.
df['Summary'] = df['Article Content'].apply(summarize_article)

# Save the DataFrame, now including the summaries, to an Excel workbook
output_file_path = 'summarized_articles.xlsx'  # Specify the output file name
df.to_excel(output_file_path, index=False)

print("Summaries have been generated and saved to", output_file_path)
