In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

In [None]:
# Custom transformer for fetching HTML content
class FetchHTMLTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that downloads the raw body of a web page.

    Parameters
    ----------
    timeout : float or tuple, default=10
        Passed straight to ``requests.get``. Without a timeout, requests
        waits indefinitely on a stalled connection, which would hang the
        notebook.
    """

    def __init__(self, timeout=10):
        # scikit-learn convention: store constructor arguments unmodified
        # under the same attribute name so get_params()/clone() work.
        self.timeout = timeout

    def fit(self, X, y=None):
        """No-op; there is nothing to learn. Returns ``self``."""
        return self

    def transform(self, X):
        """Fetch the page at URL ``X``.

        Parameters
        ----------
        X : str
            URL to request with HTTP GET.

        Returns
        -------
        bytes or None
            The response body when the status code is 200; otherwise a
            message is printed and ``None`` is returned.

        Raises
        ------
        requests.exceptions.RequestException
            Propagated from ``requests.get`` on connection errors or when
            the timeout expires.
        """
        response = requests.get(X, timeout=self.timeout)
        if response.status_code == 200:
            return response.content

        print(f"Failed to retrieve data. Status code: {response.status_code}")
        return None

In [None]:
# Custom transformer for parsing HTML and extracting data
class ParseQuotesTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that extracts quote records from HTML markup.

    Every ``<div class="quote">`` element becomes one dict with the keys
    ``"Text"``, ``"Author"`` and ``"Tags"``.
    """

    def fit(self, X, y=None):
        """No-op; there is nothing to learn. Returns ``self``."""
        return self

    def transform(self, X):
        """Parse HTML markup into a list of quote dicts.

        Parameters
        ----------
        X : str or bytes or None
            HTML markup. ``None`` (what ``FetchHTMLTransformer.transform``
            returns after a non-200 response) yields an empty list instead
            of failing inside the parser.

        Returns
        -------
        list of dict
            One ``{"Text": str, "Author": str, "Tags": list of str}`` per
            quote block that contains a text span.
        """
        if X is None:
            return []

        soup = BeautifulSoup(X, "html.parser")
        quotes = []
        for quote in soup.find_all("div", class_="quote"):
            text_element = quote.find("span", class_="text")
            if text_element is None:
                # Malformed block without quote text: skip it rather than
                # raising AttributeError on ``None.text``.
                continue

            # Match on the CSS class only. Restricting the search to <span>
            # never matched on the scraped page (every record fell back to
            # the placeholder), so the author is evidently held in a
            # different tag -- presumably <small class="author">.
            author_element = quote.find(class_="author")
            author = author_element.text if author_element else "None one"

            tags = [tag.text for tag in quote.find_all("a", class_="tag")]
            quotes.append({"Text": text_element.text, "Author": author, "Tags": tags})
        return quotes

In [None]:
if __name__ == "__main__":
    # Page to scrape; it is requested once by the fetch step below.
    url = "https://quotes.toscrape.com"

    # Two stages run in order: download the page, then extract the quotes.
    pipeline = Pipeline(
        steps=[
            ("fetch_html", FetchHTMLTransformer()),
            ("parse_quotes", ParseQuotesTransformer()),
        ]
    )

    # Both steps have a no-op fit(), so this simply chains their transform()
    # calls: URL -> raw HTML -> list of quote dicts.
    quotes = pipeline.fit_transform(url)

# Inspecting the assembled pipeline

In [None]:
# Bare expression: the notebook renders the Pipeline object as the cell output.
pipeline

# Inspecting the scraped quotes

In [None]:
# Show the list of quote dicts returned by pipeline.fit_transform(url).
quotes

[{'Text': '“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”',
  'Author': 'None one',
  'Tags': ['change', 'deep-thoughts', 'thinking', 'world']},
 {'Text': '“It is our choices, Harry, that show what we truly are, far more than our abilities.”',
  'Author': 'None one',
  'Tags': ['abilities', 'choices']},
 {'Text': '“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”',
  'Author': 'None one',
  'Tags': ['inspirational', 'life', 'live', 'miracle', 'miracles']},
 {'Text': '“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”',
  'Author': 'None one',
  'Tags': ['aliteracy', 'books', 'classic', 'humor']},
 {'Text': "“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”",
  'Author': 'None one',
  'Tags': ['be-yourself', 'inspirational']

In [None]:
# Build a DataFrame from the list of quote dicts; the dict keys
# ("Text", "Author", "Tags") become the columns.
df = pd.DataFrame(quotes)
# Preview the first rows (head() defaults to five).
df.head()

Unnamed: 0,Text,Author,Tags
0,“The world as we have created it is a process ...,None one,"[change, deep-thoughts, thinking, world]"
1,"“It is our choices, Harry, that show what we t...",None one,"[abilities, choices]"
2,“There are only two ways to live your life. On...,None one,"[inspirational, life, live, miracle, miracles]"
3,"“The person, be it gentleman or lady, who has ...",None one,"[aliteracy, books, classic, humor]"
4,"“Imperfection is beauty, madness is genius and...",None one,"[be-yourself, inspirational]"


In [None]:
# (rows, columns) of the quotes DataFrame.
df.shape

(10, 3)

In [None]:
# Print a summary: index range, per-column non-null counts and dtypes, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    10 non-null     object
 1   Author  10 non-null     object
 2   Tags    10 non-null     object
dtypes: object(3)
memory usage: 368.0+ bytes
