# Libraries

In [24]:
import requests
import bs4 as bs
import spacy
import urllib.request
import re
nlp = spacy.load('en_core_web_sm')
from urllib.request import urlopen
from bs4 import BeautifulSoup
import urllib.request
import pandas as pd
import socket
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.platypus import SimpleDocTemplate, Paragraph

# Scraping

In [25]:
# Fetch the Wikipedia page; the context manager closes the HTTP response
# as soon as the document has been parsed instead of leaking the connection.
url = "https://en.wikipedia.org/wiki/Canoo"
with urlopen(url) as html:
    soup = BeautifulSoup(html, 'html.parser')
# Find external links: anchors that carry an href and the 'external text' class
external_links = soup.find_all("a", href=True, class_='external text')
# Collect the target URL of every external link, in document order
website = [links['href'] for links in external_links]

In [26]:
# Function to scrape the paragraph text from a link and split it into sentences
def find_para(link, timeout=60):
    """Download a page and return its cleaned paragraph text as sentences.

    The text of every ``<p>`` element is concatenated, stripped of all
    characters other than letters, digits, ``,``, ``.``, ``'`` and whitespace,
    stripped of standalone numbers, whitespace-normalised, and then split into
    sentences with the module-level spaCy pipeline ``nlp``.

    Parameters
    ----------
    link : str
        URL of the page to fetch.
    timeout : float, optional
        Timeout in seconds applied to this request only (default 60). It is
        passed to ``urlopen`` directly, so the process-wide socket default is
        left untouched.

    Returns
    -------
    tuple
        ``(link, sentences)`` where ``sentences`` is a list of str, or
        ``(None, None)`` when the page has no ``<p>`` elements or when any
        error (bad URL, network failure, timeout, parsing problem) occurs.
    """
    try:
        # `with` guarantees the response (and its socket) is closed after reading.
        with urllib.request.urlopen(link, timeout=timeout) as response:
            source = response.read()
        soup = bs.BeautifulSoup(source, 'html.parser')

        # Look the paragraphs up once; bail out if the page has none.
        paragraphs = soup.find_all('p')
        if not paragraphs:
            return None, None

        text = "".join(paragraph.text for paragraph in paragraphs)

        # Remove non-alphanumeric characters except for '.', ',', and '''
        text = re.sub(r'[^a-zA-Z0-9,.\'\s]', ' ', text)
        # Remove standalone numbers
        text = re.sub(r'\b\d+\b', '', text)
        # Remove extra spaces
        text = re.sub(r'\s+', ' ', text)

        # Process text with spaCy and split it into sentences
        doc = nlp(text)
        sentences = [sent.text for sent in doc.sents]

        return link, sentences
    except Exception:
        # Deliberate best-effort: one unreachable or malformed page must not
        # abort the whole crawl, so every failure is reported as (None, None).
        return None, None


In [27]:
# Parallel accumulators: website_url[i] is the page whose sentences are website_text[i]
website_url = []
website_text = []
# Seed both lists with the Wikipedia article itself
link, sentences = find_para('https://en.wikipedia.org/wiki/Canoo')
if not (link is None or sentences is None):
    website_url.append(link)
    website_text.append(sentences)

In [28]:
# Scrape every external link; pages that could not be processed are skipped
for url in website:
    link, sentences = find_para(url)
    if link is None or sentences is None:
        continue
    website_url.append(link)  # appending the link
    website_text.append(sentences)  # appending the sentences of that page

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


# Saving Data

In [29]:
# Build one row per scraped page (its URL and its list of sentences),
# then write the table to output.csv without the index column
df = pd.DataFrame(
    {
        "Link": list(website_url),
        "Sentences": list(website_text),
    }
)
df.to_csv("output.csv", index=False)

In [30]:
# Preview the first rows of the scraped table (head() defaults to 5 rows)
df.head()

Unnamed: 0,Link,Sentences
0,https://en.wikipedia.org/wiki/Canoo,[ Canoo is an American automotive company base...
1,https://en.wikipedia.org/w/index.php?title=Can...,[Copy and paste Cite your sources ref ref Cate...
2,https://investors.canoo.com/,[Canoo has developed breakthrough electric veh...
3,https://web.archive.org/web/20210628182454/htt...,[Los Angeles based electric vehicle startup Ca...
4,https://www.theverge.com/2021/5/17/22440287/ca...,[By Sean O'KaneCalifornia EV startup Canoo has...


In [31]:
# Store all the text in a single string so it can be converted to PDF
text = list(df.Sentences)
# One line per page; the sentences of a page are joined with ', '
string = '\n'.join(map(', '.join, website_text))
print(string)

 Canoo is an American automotive company based in Torrance, California, that develops and manufactures electric vehicles., Canoo's research development team is based in Michigan, in the Detroit region Auburn Hills, Livonia , and production operations are in Justin, Texas., The company also plans to produce commercial electric vehicles such as vans for fleet, vehicle rental and ride sharing services., Canoo was founded in under the name Evelozcity by Stefan Krause and Ulrich Kranz., Krause worked for Deutsche Bank as its chief financial officer while Kranz worked for BMW as a senior executive., Both men met at rival EV company Faraday Future before leaving together to form their own company in , due to disagreement with Faraday Future's leadership., Krause took on the role of chief executive officer at Evelozcity, and Kranz became chief technology officer., The company received its primary funding from Chinese investor Li David Pak Tam Botan and German entrepreneur David Stern., In Apri

# Data to PDF

In [32]:
#converting string to pdf
def convert_to_pdf(text, filename):
    """Write ``text`` to a letter-sized PDF file, one paragraph per input line.

    ReportLab's ``Paragraph`` interprets its input as XML-like markup and
    treats newlines as ordinary whitespace. The text is therefore split on
    ``'\\n'`` so each non-blank line becomes its own paragraph, and every line
    is escaped so that ``&``, ``<`` and ``>`` are rendered literally instead of
    being parsed as markup.

    Parameters
    ----------
    text : str
        Plain text to render; lines are separated by ``'\\n'``.
    filename : str
        Path of the PDF file to create.
    """
    # Function-scope import: standard library only, keeps the block self-contained.
    from xml.sax.saxutils import escape

    # Set up the PDF document
    doc = SimpleDocTemplate(filename, pagesize=letter)
    normal_style = getSampleStyleSheet()["Normal"]
    # Drop blank lines; fall back to a single empty paragraph for empty input
    lines = [line for line in text.split("\n") if line.strip()] or [""]
    # Create one Paragraph object per line of text
    content = [Paragraph(escape(line), normal_style) for line in lines]
    # Build the PDF
    doc.build(content)
# Render the combined scraped text into output.pdf
convert_to_pdf(text=string, filename="output.pdf")