In [37]:
import requests
from bs4 import BeautifulSoup
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
import os
from PyPDF2 import PdfReader
import re
import email 

In [55]:
# Function to scrape blog content from a website
def scrape_blogs(url, timeout=30):
    """Collect the text of every blog post linked from one listing page.

    The page at ``url`` is fetched and every ``<a class="link" href=...>``
    element is inspected. Hrefs that start with ``/resources/blog`` are turned
    into absolute URLs and downloaded through ``scrape_blog_content``.

    Args:
        url: Address of the listing page to scan.
        timeout: Seconds to wait for the listing page before giving up.

    Returns:
        A list of stripped, non-empty blog texts in the order the links were
        found. The list is empty when the listing page cannot be retrieved.
    """
    base_url = "https://www.greatplacetowork.com"

    # Without a timeout a stalled connection would block the crawl forever,
    # and without a status check an error page would be parsed as a listing.
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as exc:
        print("Failed to retrieve listing page:", url, "-", exc)
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    blogs = []
    seen_urls = set()  # the same post can be linked more than once per page
    for blog in soup.find_all('a', class_='link', href=True):
        blog_url = blog['href']
        if not blog_url.startswith('/resources/blog'):
            print("Not a blog link.")
            continue

        blog_url = f"{base_url}{blog_url}"
        if blog_url in seen_urls:
            continue
        seen_urls.add(blog_url)
        print("Found blog link:", blog_url)  # Print the found blog link

        # Scrape content from blog page; one unreachable post must not
        # discard the posts already collected.
        try:
            blog_title, blog_content = scrape_blog_content(blog_url)
        except requests.exceptions.RequestException as exc:
            print("Failed to retrieve blog page:", blog_url, "-", exc)
            continue

        # Strip before testing so whitespace-only pages are not kept.
        blog_content = blog_content.strip() if blog_content else ""
        if blog_content:
            print("Found blog content for:", blog_title)  # Print the found blog title
            blogs.append(blog_content)
        else:
            print("Blog content not found.")

    return blogs

# Function to scrape content from individual blog pages
def scrape_blog_content(blog_url, timeout=30):
    """Download one blog page and return its title and body text.

    Args:
        blog_url: Absolute URL of the blog page.
        timeout: Seconds to wait for the page before giving up.

    Returns:
        A ``(title, text)`` tuple. ``title`` is the stripped ``<title>`` text
        (``""`` when the tag is missing). ``text`` is the space-joined text of
        the page's ``<p>``, ``<h3>`` and ``<strong>`` elements. Both values are
        ``""`` when the page cannot be retrieved.
    """
    # A failed or non-2xx response is reported and mapped to empty strings so
    # that an error page is never returned as blog content.
    try:
        response = requests.get(blog_url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as exc:
        print("Failed to retrieve blog page:", blog_url, "-", exc)
        return "", ""

    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract blog title from the <title> tag
    title_tag = soup.find('title')
    if title_tag:
        blog_title = title_tag.text.strip()
        print("Found blog title:", blog_title)
    else:
        print("Blog title not found.")
        blog_title = ""

    # Find and extract text content from <p>, <h3> and <strong> elements.
    # get_text() on an element already includes the text of its descendants,
    # so an element nested inside another matched element (typically a
    # <strong> inside a <p>) is skipped to avoid emitting its text twice.
    content_tags = ['p', 'h3', 'strong']
    content_elements = [
        element
        for element in soup.find_all(content_tags)
        if element.find_parent(content_tags) is None
    ]
    content_text = ' '.join(element.get_text() for element in content_elements)

    return blog_title, content_text

# Function to scrape content from a set of links
def scrape_links(links, timeout=30):
    """Download each URL in ``links`` and return the visible page text.

    Args:
        links: Iterable of absolute URLs.
        timeout: Seconds to wait for each page before giving up.

    Returns:
        A list with the stripped ``get_text()`` of every page that answered
        with HTTP 200. Pages that fail are reported and skipped, so the result
        can be shorter than ``links``.
    """
    contents = []
    for link in links:
        # A timeout keeps one stalled host from blocking the run, and catching
        # the request error keeps pages that were already scraped.
        try:
            response = requests.get(link, timeout=timeout)
        except requests.exceptions.RequestException as exc:
            print("Failed to retrieve content from link:", link, "-", exc)
            continue

        if response.status_code != 200:
            print("Failed to retrieve content from link:", link)
            continue

        soup = BeautifulSoup(response.content, 'html.parser')
        # Customize this based on the structure of the content you want to scrape from the links
        # For example:
        # content = soup.find('div', class_='content').get_text()
        content = soup.get_text()
        contents.append(content.strip())
    return contents

# Function to scrape text content from PDF files in a folder
def scrape_pdf_content(folder_path):
    """Extract the text of every PDF file directly inside ``folder_path``.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A list with one stripped string per PDF, in sorted filename order.
        The text of consecutive pages is separated by a newline.
    """
    pdf_contents = []
    # sorted() makes the output order reproducible; os.listdir order is
    # arbitrary. The extension test is case-insensitive so "REPORT.PDF" is
    # picked up as well.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(folder_path, filename)
        print("Scraping content from PDF:", pdf_path)

        with open(pdf_path, "rb") as pdf_file:
            pdf_reader = PdfReader(pdf_file)
            # `or ""` guards against a page yielding no text object at all.
            page_texts = [page.extract_text() or "" for page in pdf_reader.pages]

        # Join with a newline so the last word of one page is not fused with
        # the first word of the next.
        pdf_contents.append("\n".join(page_texts).strip())

    return pdf_contents

# Function to scrape text content from emails in a folder
def scrape_email_content(folder_path):
    """Extract the plain-text body of every ``.eml`` file in ``folder_path``.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A list with one stripped string per e-mail, in sorted filename order.
        Each string is the concatenation of the message's ``text/plain``
        parts, and is ``""`` when the message has none.
    """
    email_contents = []
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith(".eml"):
            continue

        email_path = os.path.join(folder_path, filename)
        print("Scraping content from email:", email_path)

        # Parse from bytes: a text-mode open would decode the file with the
        # platform's default encoding, which need not match the message.
        with open(email_path, "rb") as email_file:
            msg = email.message_from_binary_file(email_file)

        text_parts = []
        for part in msg.walk():
            if part.get_content_type() != "text/plain":
                continue

            # decode=True undoes base64 / quoted-printable transfer encoding;
            # without it the raw encoded payload would be returned.
            payload = part.get_payload(decode=True)
            if payload is None:
                continue

            charset = part.get_content_charset() or "utf-8"
            try:
                text_parts.append(payload.decode(charset, errors="replace"))
            except LookupError:
                # The message declares a charset Python does not know.
                text_parts.append(payload.decode("utf-8", errors="replace"))

        email_contents.append("".join(text_parts).strip())

    return email_contents

# Function to fine-tune GPT-2 model on scraped blog content
def fine_tune_model(scraped_content, num_epochs=1000, learning_rate=1e-5, max_length=512):
    """Fine-tune the pretrained ``gpt2`` checkpoint on a list of texts.

    Each text is tokenized on its own, truncated to ``max_length`` tokens, and
    used as a single-sequence batch with the input ids doubling as labels.

    Args:
        scraped_content: Iterable of strings to train on.
        num_epochs: Number of passes over the data. The default keeps the
            original value; lower it for a run that finishes in reasonable
            time.
        learning_rate: Learning rate passed to ``torch.optim.AdamW``.
        max_length: Maximum number of tokens kept per text.

    Returns:
        The ``GPT2LMHeadModel`` after training, switched back to eval mode.
        The tokenizer is not returned; callers need to load their own.
    """
    # No pad token is registered: every batch holds exactly one unpadded
    # sequence, and adding a token without resizing the model's embeddings
    # would give it an id the model has no embedding for.
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    model = GPT2LMHeadModel.from_pretrained("gpt2")

    # Tokenize content. Skip texts that are empty or shorter than two tokens:
    # the language-model loss needs at least one next-token target, and a
    # sequence without any would poison training with an undefined loss.
    tokenized_content = []
    for content in scraped_content:
        if not content or not content.strip():
            continue
        input_ids = tokenizer.encode(
            content, return_tensors="pt", max_length=max_length, truncation=True
        )
        if input_ids.shape[1] < 2:
            continue
        tokenized_content.append(input_ids)

    # Fine-tune model
    model.train()
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    for epoch in range(num_epochs):
        for batch in tokenized_content:
            optimizer.zero_grad()
            outputs = model(input_ids=batch, labels=batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

    # Leave train mode so dropout is off when the caller generates text.
    model.eval()
    return model

# Function to generate response given input text
def generate_response(input_text, model, tokenizer, max_length=100):
    """Generate a continuation of ``input_text`` with ``model``.

    Args:
        input_text: Prompt to continue.
        model: Object exposing ``generate(...)``, e.g. a ``GPT2LMHeadModel``.
        tokenizer: Tokenizer providing ``encode``, ``decode`` and
            ``eos_token_id``.
        max_length: Value forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The first generated sequence decoded to a string with special tokens
        skipped.
    """
    # Tokenize input text
    input_ids = tokenizer.encode(input_text, return_tensors="pt")

    # Keep the prompt on the same device as the model when the model exposes one.
    device = getattr(model, "device", None)
    if device is not None:
        input_ids = input_ids.to(device)
    attention_mask = torch.ones_like(input_ids)

    # Generate in eval mode so dropout does not perturb the output, then
    # restore the caller's mode so an ongoing training loop is unaffected.
    was_training = getattr(model, "training", False)
    if was_training:
        model.eval()
    try:
        with torch.no_grad():
            output = model.generate(
                input_ids,
                attention_mask=attention_mask,
                max_length=max_length,
                pad_token_id=tokenizer.eos_token_id,
                num_return_sequences=1,
            )
    finally:
        if was_training:
            model.train()

    # Decode response
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response


In [56]:
# Main function
if __name__ == "__main__":
    # Driver: gather text from the blog listing, a few report pages, and local
    # PDF / e-mail folders, then fine-tune GPT-2 on the combined corpus.
    url = "https://www.greatplacetowork.com/resources/all-insights"
    num_pages = 90  # Adjust this based on the total number of pages

    # Scraping blogs from multiple pages
    all_blogs = []
    for i in range(num_pages):
        page_url = f"{url}?start={i * 12}"
        print("Scraping page:", page_url)
        page_blogs = scrape_blogs(page_url)
        all_blogs.extend(page_blogs)

    # Scrape content from a set of links
    links = [
        "https://cloud.kapostcontent.net/pub/2f4c3763-0315-4f06-a947-0213596c2f04/2020-worlds-best-workplaces-rising-to-historic-challenges?kui=WXQe4R4ni4cOBETV8ABaEg",
        "https://cloud.kapostcontent.net/pub/3c6877ba-67b9-400d-ad8c-439b609d9999/the-future-of-work-is-for-all-2020-fortune-100-best-companies-report?kui=8V47uCOLuNVcSP_ZMaYlpQ",
        "https://cloud.kapostcontent.net/pub/553e8f75-a21d-4551-b42a-676840a73323/d-and-i-puzzle?kui=-U-DLNYkmOsDDvb7rhMjFQ"
    ]
    scraped_links = scrape_links(links)

    # Local folders default to the original machine's locations but can be
    # redirected through environment variables, and a missing folder is
    # skipped instead of aborting the run after all the web scraping is done.
    pdf_folder_path = os.environ.get(
        "PDF_TRAINING_DIR",
        "C:/Users/chand/OneDrive/Documents 1/chatgptw/pdfs_training",
    )
    if os.path.isdir(pdf_folder_path):
        scraped_pdfs = scrape_pdf_content(pdf_folder_path)
    else:
        print("PDF folder not found, skipping:", pdf_folder_path)
        scraped_pdfs = []

    email_folder_path = os.environ.get(
        "EMAIL_TRAINING_DIR",
        "C:/Users/chand/OneDrive/Documents 1/chatgptw/emails_scrapping",
    )
    if os.path.isdir(email_folder_path):
        scraped_emails = scrape_email_content(email_folder_path)
    else:
        print("Email folder not found, skipping:", email_folder_path)
        scraped_emails = []

    # Combine all scraped content. Blank documents (e.g. an e-mail without a
    # plain-text part) carry nothing to learn from, and exact duplicates would
    # be over-weighted, so both are dropped; dict.fromkeys keeps first-seen order.
    combined_content = all_blogs + scraped_links + scraped_pdfs + scraped_emails
    all_scraped_content = list(
        dict.fromkeys(text for text in combined_content if text and text.strip())
    )
    print("Documents available for fine-tuning:", len(all_scraped_content))

    # Fine-tune GPT-2 model on scraped content
    model = fine_tune_model(all_scraped_content)

Scraping page: https://www.greatplacetowork.com/resources/all-insights?start=0
Found blog link: https://www.greatplacetowork.com/resources/blog/how-return-to-office-mandates-pose-risks-productivity-wellbeing-retention
Found blog title: How Return-to-Office Mandates Pose Risks to Productivity, Well-Being, and Retention | Great Place To Work®
Found blog content for: How Return-to-Office Mandates Pose Risks to Productivity, Well-Being, and Retention | Great Place To Work®
Found blog link: https://www.greatplacetowork.com/resources/blog/how-leaders-at-great-workplaces-develop-and-grow-talent
Found blog title: How Leaders at Great Workplaces Develop and Grow Talent | Great Place To Work®
Found blog content for: How Leaders at Great Workplaces Develop and Grow Talent | Great Place To Work®
Not a blog link.
Not a blog link.
Not a blog link.
Not a blog link.
Not a blog link.
Found blog link: https://www.greatplacetowork.com/resources/blog/why-job-seekers-prefer-certified-workplaces
Found blog 

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Scraping content from PDF: C:/Users/chand/OneDrive/Documents 1/chatgptw/pdfs_training\how-to-create-an-innovation-by-all-culture-innovation-insights-4.pdf
Scraping content from PDF: C:/Users/chand/OneDrive/Documents 1/chatgptw/pdfs_training\innovation-by-all-innovation-insights-1.pdf
Scraping content from PDF: C:/Users/chand/OneDrive/Documents 1/chatgptw/pdfs_training\innovation-everywhere-innovation-insights-3.pdf
Scraping content from PDF: C:/Users/chand/OneDrive/Documents 1/chatgptw/pdfs_training\the-five-hidden-barriers-to-innovation-innovation-insights-2.pdf
Scraping content from PDF: C:/Users/chand/OneDrive/Documents 1/chatgptw/pdfs_training\women-in-the-workplace-2019-best-workplaces-for-women-report.pdf


KeyboardInterrupt: 

In [53]:
# Example usage
# `fine_tune_model` returns only the model, so no `tokenizer` exists at
# notebook scope; load the matching GPT-2 tokenizer here so this cell also
# runs on a fresh kernel.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
input_text = "what are the leadership behaviors?"
response = generate_response(input_text, model, tokenizer)
print("Generated response:", response)

Generated response: what is work?


The answer is that the work is a process.

The process of creating a work is a process.

The process of creating a work is a process of creating a product.

The process of creating a product is a process of creating a product.

The process of creating a product is a process of creating a product.

The process of creating a product is a process of creating a product.

The process of creating a
