In [1]:
import pandas as pd
import numpy as np
import requests as req
from bs4 import BeautifulSoup
import re
import os
import spacy

In [2]:
def replace_multiple_whitespace_logic(match_group):
    """Pick the single character that replaces a matched whitespace run.

    Intended as the replacement callback for ``re.sub``.

    Args:
        match_group: A regex match object (or any object whose item ``0`` is
            the matched text).

    Returns:
        A newline if the matched text contains a newline, otherwise a single
        space.
    """
    matched_text = match_group[0]
    return "\n" if "\n" in matched_text else " "

def format_text(text):
    """Collapse runs of whitespace in ``text``.

    A run of spaces and newlines that contains at least one newline is
    replaced by a single newline. A run of two or more other whitespace
    characters (space, tab, carriage return, form feed, vertical tab) is
    replaced by a single space.

    Args:
        text: The string to clean up.

    Returns:
        The string with consecutive whitespace collapsed.
    """
    # remove multiple consecutive whitespaces or new lines
    # The quantifier must be written "{2,}" without a space: Python's ``re``
    # does not recognise "{2, }" as a repetition and matches it as literal
    # text, which left runs of spaces/tabs untouched.
    return re.sub(r"[ \n]*\n[ \n]*|[ \t\r\f\v]{2,}", replace_multiple_whitespace_logic, text)

def process_html_page(url, timeout=2):
    """Download ``url`` and return its text content with whitespace collapsed.

    Args:
        url: Address of the HTML page to fetch.
        timeout: Timeout forwarded to ``requests.get`` (defaults to 2).

    Returns:
        The text returned by ``BeautifulSoup.get_text()``, stripped of
        leading/trailing spaces and newlines and passed through
        ``format_text``.

    Raises:
        requests.RequestException: If the request fails, or (as the
            ``HTTPError`` subclass) if the server answers with a 4xx/5xx
            status.
    """
    response = req.get(url, timeout=timeout)
    # Treat HTTP error statuses as failures so the body of an error page is
    # never returned as if it were real page content. HTTPError derives from
    # RequestException, so callers catching RequestException still work.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    raw_text = soup.get_text()
    trimmed_text = raw_text.strip(" \n")
    return format_text(trimmed_text)

In [3]:
# Load the list of page URLs. ``header=0`` together with ``names`` replaces
# the header row found in the file with the single column name "url".
furniture_pages_path = "furniture_stores_pages.csv"
furniture_pages = pd.read_csv(
    furniture_pages_path,
    header=0,
    names=["url"],
)
# Preview the first rows.
furniture_pages.head()

Unnamed: 0,url
0,https://www.factorybuys.com.au/products/euro-t...
1,https://dunlin.com.au/products/beadlight-cirrus
2,https://themodern.net.au/products/hamar-plant-...
3,https://furniturefetish.com.au/products/oslo-o...
4,https://hemisphereliving.com.au/products/


In [4]:
# Summarize the frame: number of rows, column dtypes and non-null counts.
furniture_pages.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 704 entries, 0 to 703
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   url     704 non-null    object
dtypes: object(1)
memory usage: 5.6+ KB


In [22]:
# Output locations, all rooted at a "data" folder inside the current working
# directory: the scraped page texts go in "text", the token table in
# "tokenized.csv".
data_path = os.path.join(os.getcwd(), "data")
text_data_path, tokenized_text_data_path = (
    os.path.join(data_path, leaf) for leaf in ("text", "tokenized.csv")
)

def create_data_dirs():
    """Create the directory that receives the scraped page texts.

    Creates ``text_data_path`` (including missing parent directories) and
    prints a confirmation message.

    Raises:
        FileExistsError: If ``text_data_path`` already exists.
    """
    # exist_ok=False is the default; spelled out to make the failure mode
    # on an existing directory explicit.
    os.makedirs(text_data_path, exist_ok=False)
    print(f"Create directory \"{text_data_path}\"")

def sample_furniture_pages(furniture_pages, num_samples, random_state=None):
    """Split ``furniture_pages`` into a random sample and the remaining rows.

    Args:
        furniture_pages: DataFrame to draw rows from.
        num_samples: Maximum number of rows to sample.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            draw can be reproduced. ``None`` (the default) keeps the sampling
            non-deterministic.

    Returns:
        A tuple ``(sampled, remaining)``. ``sampled`` holds ``num_samples``
        randomly chosen rows, or the whole input frame when it has no more
        than ``num_samples`` rows. ``remaining`` holds the rows of the input
        whose index labels are not in ``sampled``.
    """
    # Compare against the number of rows. ``DataFrame.size`` is rows * columns,
    # which overstates the population for frames with more than one column and
    # makes ``sample`` raise when asked for more rows than exist.
    if len(furniture_pages) > num_samples:
        sampled_furniture_pages = furniture_pages.sample(n=num_samples, random_state=random_state)
    else:
        sampled_furniture_pages = furniture_pages
    # Rows are excluded by index label, so this assumes unique index labels.
    remaining_furniture_pages = furniture_pages.loc[
        ~furniture_pages.index.isin(sampled_furniture_pages.index)
    ]
    return sampled_furniture_pages, remaining_furniture_pages

def should_generate_new_data():
    """Tell whether the scraping step still has to run.

    Returns:
        ``True`` when ``text_data_path`` is not an existing directory,
        ``False`` otherwise.
    """
    text_dir_exists = os.path.isdir(text_data_path)
    return not text_dir_exists

def generate_new_data(furniture_pages, num_samples, max_count_attempts):
    """Scrape pages and store their text as numbered files in ``text_data_path``.

    URLs are drawn in rounds from ``furniture_pages``. Each round samples as
    many not-yet-tried rows as are still needed; URLs whose download raises a
    ``requests.RequestException`` are skipped. Rounds continue until
    ``num_samples`` files have been written, ``max_count_attempts`` rounds
    have run, or no untried rows remain.

    Args:
        furniture_pages: DataFrame with exactly one column holding the URLs
            (each row is unpacked as ``(index, url)``).
        num_samples: Number of page texts to collect.
        max_count_attempts: Maximum number of sampling rounds.

    Side effects:
        Creates ``text_data_path`` via ``create_data_dirs`` and writes the
        files ``1.txt``, ``2.txt``, ... into it; prints progress messages.
    """
    # try to sample num_samples but there might be many broken url in our samples so
    # it might be necessary to sample multiple times but no more than max_count_attempts
    print("Start web scarping to generate new data samples")
    create_data_dirs()
    count = 0
    count_attempts = 0
    count_total_samples = 0
    furniture_pages_copy = furniture_pages.copy()
    while (num_samples - count > 0 and
           count_attempts < max_count_attempts and
           not furniture_pages_copy.empty):
        sampled_furniture_pages, furniture_pages_copy = sample_furniture_pages(furniture_pages_copy, num_samples - count)
        # Count sampled rows (len), not cells (size).
        count_total_samples += len(sampled_furniture_pages)
        for _, url in sampled_furniture_pages.itertuples():
            try:
                text = process_html_page(url)
            except req.RequestException:
                # Best effort: a broken URL is skipped and made up for in a
                # later sampling round.
                continue
            with open(os.path.join(text_data_path, f"{count + 1}.txt"), "w") as file:
                file.write(text)
            count += 1
        count_attempts += 1

    print(f"Sampled {count_total_samples}")
    # Only the text files are produced here; the tokenized CSV is written by
    # the preprocessing step, so nothing is reported about it.
    print(f"{count} files were written in directory \"{text_data_path}\"")

def flatten(list):
    """Concatenate the items of every sub-iterable into one new list.

    Args:
        list: An iterable of iterables. (The name shadows the builtin but is
            kept so keyword callers keep working.)

    Returns:
        A new list with the items of each sub-iterable, in order.
    """
    return [item for sublist in list for item in sublist]

def should_preprocess_data():
    """Tell whether the tokenization step still has to run.

    Returns:
        ``True`` when ``text_data_path`` is an existing directory and
        ``tokenized_text_data_path`` is not an existing file; ``False``
        otherwise.
    """
    if not os.path.isdir(text_data_path):
        return False
    return not os.path.isfile(tokenized_text_data_path)

def tokenize_files(file_paths):
    """Tokenize text files with spaCy and return all tokens as one flat list.

    Each file is read, its newlines are replaced by spaces, and the result is
    tokenized with the ``en_core_web_sm`` pipeline. The marker
    ``"END_SEQUENCE"`` is appended after the tokens of every file that yields
    at least one token; files yielding no tokens contribute nothing.

    Args:
        file_paths: Iterable of paths to the text files to tokenize.

    Returns:
        A list of token strings for all files, in the order of ``file_paths``.
    """
    nlp = spacy.load("en_core_web_sm")
    all_tokens = []
    for path in file_paths:
        with open(path) as handle:
            contents = handle.read()
        doc = nlp(contents.replace("\n", " "))
        words = [token.text for token in doc]
        if not words:
            continue
        all_tokens.extend(words)
        all_tokens.append("END_SEQUENCE")
    return all_tokens
    
def preprocess_data():
    """Tokenize the scraped text files and save the tokens as a CSV table.

    Reads the regular files found directly in ``text_data_path`` in sorted
    name order, tokenizes them with ``tokenize_files``, and writes a table
    with the columns ``word`` (one token per row) and ``tag`` (all NaN) to
    ``tokenized_text_data_path``. Prints the number of files read and the
    output location.
    """
    # os.listdir returns entries in arbitrary order and may include
    # sub-directories (e.g. notebook checkpoint folders). Sort the names so
    # the token order is reproducible and keep regular files only so open()
    # is never called on a directory.
    candidate_paths = (
        os.path.join(text_data_path, name) for name in sorted(os.listdir(text_data_path))
    )
    file_paths = [path for path in candidate_paths if os.path.isfile(path)]
    tokens = tokenize_files(file_paths)
    print(f"Read {len(file_paths)} files from directory \"{text_data_path}\"")
    df = pd.DataFrame({"word": tokens, "tag": np.nan})
    df.to_csv(tokenized_text_data_path)
    print(f"Tokenized data have been written in file \"{tokenized_text_data_path}\"")

def main():
    """Run the pipeline stages whose outputs are missing.

    Scrapes new page texts when ``should_generate_new_data()`` is true
    (aiming for 100 pages in at most 4 sampling rounds), then tokenizes them
    when ``should_preprocess_data()`` is true.
    """
    needs_scraping = should_generate_new_data()
    if needs_scraping:
        generate_new_data(
            furniture_pages,
            num_samples=100,
            max_count_attempts=4,
        )
    # Checked only after the scraping stage, since that stage creates the
    # directory this check looks for.
    needs_tokenizing = should_preprocess_data()
    if needs_tokenizing:
        preprocess_data()

When the directory "data/text" does not exist, try to sample 100 URLs from "furniture_stores_pages.csv" and download those web pages. The text extracted from each HTML page is written to the directory "data/text", with runs of consecutive whitespace and newlines collapsed.

When the file "data/tokenized.csv" does not exist, read the files from the directory "data/text" and tokenize them. The result is written to the file "data/tokenized.csv". Each file is treated as one sequence of text whose end is marked by the token "END_SEQUENCE".

In [23]:
# Run the pipeline: scraping runs only if the text directory is missing,
# tokenization only if the tokenized CSV is missing.
main()

Read 100 files from directory "/home/eduard/residence/projects/ml/product_extraction/data/text"
Tokenized data have been written in file "/home/eduard/residence/projects/ml/product_extraction/data/tokenized.csv"
