# **Milestone 1: Create the Dataset**

*   Parse websites/html-files
*   Extract the content (title, description) using BeautifulSoup
*   Import the labels from csv
*   Merge data into a DataFrame
*   Export data to a .csv-file





### **Setting up the environment**

In [None]:
# Mount Google Drive at /content/drive so that the /content/drive/MyDrive/... paths
# used throughout this notebook resolve.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# beautifulsoup to parse html-files
!pip install beautifulsoup4



### **Importing the required modules**

In [None]:
# import relevant libraries
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import csv
import shutil

### **Preparing train data**

In [None]:
# make directory for all html-files
# (exist_ok=True keeps this cell re-runnable when the directory already exists)
new_dir = '/content/drive/MyDrive/Webshop_classification/data/train_pred_html'
os.makedirs(new_dir, exist_ok=True)

In [None]:
# make directory for the train dataset
# (exist_ok=True keeps this cell re-runnable when the directory already exists)
train_dir = os.path.join(new_dir, 'train')
os.makedirs(train_dir, exist_ok=True)

# make directory for the prediction dataset
test_dir = os.path.join(new_dir, 'pred')
os.makedirs(test_dir, exist_ok=True)

**Preparing files from directory**

In [None]:
# copy the training files into the corresponding folder

# directory where to store the html-files for the train
new_dir_train = r'/content/drive/MyDrive/Webshop_classification/data/train_pred_html/train/'

# directory where all html-files have been stored so far
original_dir = r'/content/drive/MyDrive/Webshop_classification/data/scraped_html/'

# path of the csv-file containing the list of the training data
csv_file_train = r'/content/drive/MyDrive/Webshop_classification/data/dataset1.csv'

# select all html-files for training and copy them into the corresponding folder;
# the first csv column holds the file name without the '.html' suffix
with open(csv_file_train, 'r', newline='') as f:
  reader = csv.reader(f)
  next(reader, None)  # skip the header row
  for row in reader:
    if not row:
      # csv.reader yields an empty list for blank lines
      continue
    filename = row[0] + '.html'

    from_filename = os.path.join(original_dir, filename)
    to_filename = os.path.join(new_dir_train, filename)

    try:
      shutil.copy2(from_filename, to_filename)
      print("Moved - '{}' -> '{}'".format(from_filename, to_filename))
    except OSError:
      # shutil.Error derives from OSError, so this also reports a missing
      # source file or an unwritable target instead of aborting the loop
      print("Failed - '{}' -> '{}'".format(from_filename, to_filename))

Moved - '/content/drive/MyDrive/Webshop_classification/data/scraped_html/111.com.html' -> '/content/drive/MyDrive/Webshop_classification/data/train_pred_html/train/111.com.html'
Moved - '/content/drive/MyDrive/Webshop_classification/data/scraped_html/12xl.de.html' -> '/content/drive/MyDrive/Webshop_classification/data/train_pred_html/train/12xl.de.html'
Moved - '/content/drive/MyDrive/Webshop_classification/data/scraped_html/1a-buerotechnik.de.html' -> '/content/drive/MyDrive/Webshop_classification/data/train_pred_html/train/1a-buerotechnik.de.html'
Moved - '/content/drive/MyDrive/Webshop_classification/data/scraped_html/1a-yachtcharter.de.html' -> '/content/drive/MyDrive/Webshop_classification/data/train_pred_html/train/1a-yachtcharter.de.html'
Moved - '/content/drive/MyDrive/Webshop_classification/data/scraped_html/1blu.de.html' -> '/content/drive/MyDrive/Webshop_classification/data/train_pred_html/train/1blu.de.html'
Moved - '/content/drive/MyDrive/Webshop_classification/data/scrape

In [None]:
# count the entries in the training folder
n_train_files = len(os.listdir(new_dir_train))
print('total files for training: ', n_train_files)

total files for training:  860


**Parsing and extraction**

In [None]:
# directory where the files for training are stored
# (same path string as new_dir_train, which the copy cell above writes into)
directory_train = r'/content/drive/MyDrive/Webshop_classification/data/train_pred_html/train/'

In [None]:
# get all filenames that are stored in the directory
def readFilesnamesFromDirectory(directory):
  """Walk ``directory`` recursively and list every file found.

  Within each visited folder the file names are processed in sorted order.

  Args:
    directory: root folder to walk.

  Returns:
    Tuple ``(full_paths, bare_names)`` of two equally long lists: the file
    paths joined with their folder, and the plain file names.
  """
  found = [
      (os.path.join(folder, name), name)
      for folder, _subfolders, names in os.walk(directory)
      for name in sorted(names)
  ]
  full_paths = [path for path, _ in found]
  bare_names = [name for _, name in found]
  return full_paths, bare_names

In [None]:
# collect full paths and bare file names of the training files
fn_results_train, files_train = readFilesnamesFromDirectory(directory_train)
print(f'There are {len(fn_results_train)} files in the directory')

There are 860 files in the directory


In [None]:
# pick the title of one parsed document
def _extract_title(html):
  """Return the first non-empty title candidate of a parsed document.

  Candidates are tried in this order: ``<title>``,
  ``<meta property="og:title">``, ``<meta property="twitter:title">``,
  first ``<h1>``. A candidate that is missing or has no text is skipped
  instead of ending the search.

  Args:
    html: parsed BeautifulSoup document.

  Returns:
    The title as a plain ``str``; the empty string when no candidate has text.
  """
  if html.title is not None and html.title.string:
    return str(html.title.string)

  # og: snippets of code that control how URLs are displayed when shared on social media
  for meta_property in ("og:title", "twitter:title"):
    meta = html.find("meta", property=meta_property)
    if meta is not None and meta.get('content'):
      return str(meta.get('content'))

  heading = html.find("h1")
  if heading is not None and heading.string:
    return str(heading.string)

  return ""


# read titles from html-files and return a list containing one title per file
def readHTMLTitlesFromFile(fn_results):
  """Extract a title for every HTML file in ``fn_results``.

  Each file is opened as UTF-8 text, parsed with the ``html5lib`` parser and
  reduced to one title string by ``_extract_title``. The complete list is
  printed before it is returned.

  Args:
    fn_results: iterable of paths to HTML files.

  Returns:
    list of str, one entry per input path, in input order; the empty string
    for documents without any usable title.
  """
  html_titles = []

  for file in fn_results:
    # the context manager closes the file handle once parsing is done
    with open(file, 'r', encoding='utf-8') as contents:
      html = BeautifulSoup(contents, 'html5lib')

    html_titles.append(_extract_title(html))
  print(html_titles)
  return html_titles

In [None]:
# get the titles (one entry per path in fn_results_train; the function also prints the full list)
html_titles_train = readHTMLTitlesFromFile(fn_results=fn_results_train)

['dns-net.ch | DNS-NET Services GmbH', 'Bekleidung  in Übergrößen für Herren |  Herrenmode Online-Shop | 12xl.de ', 'Lastschrift, Kreditkarte und Rechnungskauf ohne Paypal-Konto\xa0-\xa01a-Bürotechnik Ihr Discount-Versand, schnell-kompetent-preiswert', 'Yachtcharter - 9196 Yachten online chartern', 'Neue Internetpräsenz.\n', '1&1 | Europas größter Hoster für Websites, Domains und Mail', '4Taktershop - Baotian und Rex Roller Ersatzteile, Benzhou Tuning-Teile und 4Takt Zubehör', '5 Sterne Yachtcharter ', 'Last Minute Urlaub 5vorFlug® - Lastminute Flüge & Schnäppchenreisen', 'A&A REISEN - BALATON, URLAUB, FERIENWOHNUNG, FERIENHÄUSER, PLATTENSEE, UNGARN, TSCHECHIEN, RIESENGEBIRGE', 'A1TQ-Shop Sportbekleidung fuer Outdoor, Regen, Freizeit', 'Urlaub: Bis 40% Rabatt beim Marktführer | ab-in-den-urlaub.de', 'Urlaub und Reisen für 2017/2018 günstig bei AB INS BLAUE buchen', 'Birkenstock online kaufen  | aboutshoes.com', 'Schmuck Uhren Online günstig kaufen - Ch. Abramowicz', 'Herren Übergrößen 

In [None]:
# report how many titles were extracted
print(f'The list contains {len(html_titles_train)} titles')

The list contains 860 titles


In [None]:
# convert every title to a plain str
html_titles_train = list(map(str, html_titles_train))

In [None]:
# read text content from html-files and return a list containing the content
def readHTMLContentFromFile(fn_results):
  """Extract the text of the first ``<p>`` element of every HTML file.

  Each file is opened as UTF-8 text and parsed with the ``html5lib`` parser.

  Args:
    fn_results: iterable of paths to HTML files.

  Returns:
    list of str, one entry per input path, in input order; the empty string
    for documents that contain no ``<p>`` element.
  """
  html_contents = []

  for file in fn_results:
    # the context manager closes the file handle once parsing is done
    with open(file, 'r', encoding='utf-8') as contents:
      html = BeautifulSoup(contents, 'html5lib')

    # only the first paragraph tag <p></p> is used; its text pieces are joined
    # with a single space and stripped of surrounding whitespace
    first_paragraph = html.find('p')
    if first_paragraph is None:
      content = ""
    else:
      content = first_paragraph.get_text(' ', strip=True)

    html_contents.append(content)
  return html_contents

In [None]:
# get the text content (one entry per path in fn_results_train)
html_content_train = readHTMLContentFromFile(fn_results=fn_results_train)
# NOTE(review): this prints every extracted description at once; consider printing a slice
print(html_content_train)

['Über DNS-NET', 'Willkommen im Online-Shop für Übergrößen von Honeymoon. Hier finden Sie Herrenmode für den modernen und stilvollen Mann bis Größe 15XL. Wir bieten Ihnen eine große Auswahl an Herren Bekleidung. Suchen Sie einfach in unserem Online-Shop nach der passenden Herrenmode für Ihren Geschmack. Wir sind uns sicher, da ist auch für Sie etwas dabei!', 'AVerVision F17-8M Full HD, AVer...', 'Alle Reviere und Länder', '', '.de Neu-Kunden Angebot', 'Diese Webseite benutzt Cookies, damit wir Ihnen das bestmögliche Shoppingerlebnis bieten können.', 'Mehr zu unserem Angebot', 'Hotline 089 - 71045 4109 täglich 8 - 23 Uhr (inkl. Sonn- und Feiertage)', '', 'Diesen Artikel bookmarken bei', 'Ortstarif, Mobilfunk abweichend', '', 'Unsere Schuhe haben das Potenzial, dein neuer Lieblingsschuh zu werden. Was sich wie ein paar hochtrabende Worte anhört, ist bei aboutshoes.com Fakt. Bei uns dreht sich alles um das eine Schuhpaar. Die Schuhe, die du gar nicht mehr ausziehen willst. Die du am liebs

In [None]:
# report how many descriptions were extracted
print(f'The list contains {len(html_content_train)} descriptions')

The list contains 860 descriptions


In [None]:
# combine file names, titles and text content row by row into a DataFrame
train_rows = list(zip(files_train, html_titles_train, html_content_train))
df_train = pd.DataFrame(train_rows, columns=['Website', 'Title', 'Description'])
df_train.head(20)

Unnamed: 0,Website,Title,Description
0,111.com.html,dns-net.ch | DNS-NET Services GmbH,Über DNS-NET
1,12xl.de.html,Bekleidung in Übergrößen für Herren | Herren...,Willkommen im Online-Shop für Übergrößen von H...
2,1a-buerotechnik.de.html,"Lastschrift, Kreditkarte und Rechnungskauf ohn...","AVerVision F17-8M Full HD, AVer..."
3,1a-yachtcharter.de.html,Yachtcharter - 9196 Yachten online chartern,Alle Reviere und Länder
4,1blu.de.html,Neue Internetpräsenz.\n,
5,1und1.info.html,"1&1 | Europas größter Hoster für Websites, Dom...",.de Neu-Kunden Angebot
6,4taktershop.de.html,4Taktershop - Baotian und Rex Roller Ersatztei...,"Diese Webseite benutzt Cookies, damit wir Ihne..."
7,5sterne-yachtcharter.de.html,5 Sterne Yachtcharter,Mehr zu unserem Angebot
8,5vorflug.de.html,Last Minute Urlaub 5vorFlug® - Lastminute Flüg...,Hotline 089 - 71045 4109 täglich 8 - 23 Uhr (i...
9,a-a-reisen.de.html,"A&A REISEN - BALATON, URLAUB, FERIENWOHNUNG, F...",


**Adding Labels**

In [None]:
# path of the csv-file containing the labels
csv_file_train = r'/content/drive/MyDrive/Webshop_classification/data/dataset1.csv'

# read the labels from the csv and return two lists: the filenames and the corresponding labels
def import_labels(csv_file):
  """Read file names and labels from a csv file with a header row.

  The first column of each data row is turned into a file name by appending
  '.html'; the second column is taken as the label. Blank lines are skipped.

  Args:
    csv_file: path of the csv file to read.

  Returns:
    Tuple ``(files, labels)`` of two equally long lists of str, in file order.

  Raises:
    IndexError: if a non-blank data row has fewer than two columns.
  """
  files = []
  labels = []
  # open the file named by the parameter, not the notebook-level csv_file_train
  with open(csv_file, 'r', newline='') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row
    for row in reader:
      if not row:
        # csv.reader yields an empty list for blank lines
        continue
      files.append(row[0] + '.html')
      labels.append(row[1])
  return files, labels

In [None]:
# load file names and labels from the csv and show the first ten file names
files_train, labels = import_labels(csv_file_train)
print(files_train[:10])

# both lists should have the same length
print(f'The list contains {len(files_train)} files and {len(labels)} labels.')

['111.com.html', '12xl.de.html', '1a-buerotechnik.de.html', '1a-yachtcharter.de.html', '1blu.de.html', '1und1.info.html', '4taktershop.de.html', '5sterne-yachtcharter.de.html', '5vorflug.de.html', 'a-a-reisen.de.html']
The list contains 860 files and 860 labels.


In [None]:
# add the labels to the DataFrame, matched on the file name instead of the row
# position: the rows follow the sorted directory listing, the labels follow the
# csv order, and nothing guarantees that both orders agree
label_by_file = dict(zip(files_train, labels))
df_train['Labels'] = df_train['Website'].map(label_by_file)
assert df_train['Labels'].notna().all(), 'some websites have no label in the csv'
df_train.head(20)

Unnamed: 0,Website,Title,Description,Labels
0,111.com.html,dns-net.ch | DNS-NET Services GmbH,Über DNS-NET,0
1,12xl.de.html,Bekleidung in Übergrößen für Herren | Herren...,Willkommen im Online-Shop für Übergrößen von H...,1
2,1a-buerotechnik.de.html,"Lastschrift, Kreditkarte und Rechnungskauf ohn...","AVerVision F17-8M Full HD, AVer...",1
3,1a-yachtcharter.de.html,Yachtcharter - 9196 Yachten online chartern,Alle Reviere und Länder,0
4,1blu.de.html,Neue Internetpräsenz.\n,,0
5,1und1.info.html,"1&1 | Europas größter Hoster für Websites, Dom...",.de Neu-Kunden Angebot,0
6,4taktershop.de.html,4Taktershop - Baotian und Rex Roller Ersatztei...,"Diese Webseite benutzt Cookies, damit wir Ihne...",1
7,5sterne-yachtcharter.de.html,5 Sterne Yachtcharter,Mehr zu unserem Angebot,0
8,5vorflug.de.html,Last Minute Urlaub 5vorFlug® - Lastminute Flüg...,Hotline 089 - 71045 4109 täglich 8 - 23 Uhr (i...,0
9,a-a-reisen.de.html,"A&A REISEN - BALATON, URLAUB, FERIENWOHNUNG, F...",,0


In [None]:
# check the shape of the DataFrame: (number of rows, number of columns)
print(df_train.shape)

(860, 4)


**Building text corpus**

In [None]:
# join each title with its description, separated by a single space
df_train['Text Corpus'] = [
    title + ' ' + description
    for title, description in zip(html_titles_train, html_content_train)
]

In [None]:
# show the first 5 entries of the text corpus
print(df_train['Text Corpus'].head(5))

0      dns-net.ch | DNS-NET Services GmbH Über DNS-NET
1    Bekleidung  in Übergrößen für Herren |  Herren...
2    Lastschrift, Kreditkarte und Rechnungskauf ohn...
3    Yachtcharter - 9196 Yachten online chartern Al...
4                             Neue Internetpräsenz.\n 
Name: Text Corpus, dtype: object


**Exporting the DataFrame to csv**

In [None]:
# export the DataFrame to csv; index=False leaves the row index out of the file
df_train.to_csv('/content/drive/MyDrive/Webshop_classification/data/train_dataset.csv', index=False)

### **Preparing prediction data**

In [None]:
# copy the prediction files into the corresponding folder

# directory where to store the html-files for the prediction
new_dir_pred = r'/content/drive/MyDrive/Webshop_classification/data/train_pred_html/pred/'

# directory where all html-files have been stored so far
original_dir = r'/content/drive/MyDrive/Webshop_classification/data/scraped_html/'

# path of the csv-file containing the list of the prediction data
csv_file_pred = r'/content/drive/MyDrive/Webshop_classification/data/dataset2.csv'

# select all html-files for prediction and copy them into the corresponding folder;
# the first csv column holds the file name without the '.html' suffix
with open(csv_file_pred, 'r', newline='') as f:
  reader = csv.reader(f)
  next(reader, None)  # skip the header row
  for row in reader:
    if not row:
      # csv.reader yields an empty list for blank lines
      continue
    filename = row[0] + '.html'

    from_filename = os.path.join(original_dir, filename)
    to_filename = os.path.join(new_dir_pred, filename)

    try:
      shutil.copy2(from_filename, to_filename)
      print("Moved - '{}' -> '{}'".format(from_filename, to_filename))
    except OSError:
      # shutil.Error derives from OSError, so this also reports a missing
      # source file or an unwritable target instead of aborting the loop
      print("Failed - '{}' -> '{}'".format(from_filename, to_filename))

Moved - '/content/drive/MyDrive/Webshop_classification/data/scraped_html/77records.de.html' -> '/content/drive/MyDrive/Webshop_classification/data/train_pred_html/pred/77records.de.html'
Moved - '/content/drive/MyDrive/Webshop_classification/data/scraped_html/absperrtechnik24.de.html' -> '/content/drive/MyDrive/Webshop_classification/data/train_pred_html/pred/absperrtechnik24.de.html'
Moved - '/content/drive/MyDrive/Webshop_classification/data/scraped_html/ackermedia.de.html' -> '/content/drive/MyDrive/Webshop_classification/data/train_pred_html/pred/ackermedia.de.html'
Moved - '/content/drive/MyDrive/Webshop_classification/data/scraped_html/acris-ecommerce.at.html' -> '/content/drive/MyDrive/Webshop_classification/data/train_pred_html/pred/acris-ecommerce.at.html'
Moved - '/content/drive/MyDrive/Webshop_classification/data/scraped_html/adepto-shop.de.html' -> '/content/drive/MyDrive/Webshop_classification/data/train_pred_html/pred/adepto-shop.de.html'
Moved - '/content/drive/MyDrive/W

In [None]:
# count the entries in the prediction folder
n_pred_files = len(os.listdir(new_dir_pred))
print('total files for prediction: ', n_pred_files)

total files for prediction:  200


**Parsing and extraction**

In [None]:
# directory where the files for prediction are stored
# (same path string as new_dir_pred, which the copy cell above writes into)
directory_pred = r'/content/drive/MyDrive/Webshop_classification/data/train_pred_html/pred/'

In [None]:
# collect full paths and bare file names of the prediction files
fn_results_pred, files_pred = readFilesnamesFromDirectory(directory_pred)
print(f'There are {len(fn_results_pred)} files in the directory')

There are 200 files in the directory


In [None]:
# get the titles (one entry per path in fn_results_pred; the function also prints the full list)
html_titles_pred = readHTMLTitlesFromFile(fn_results=fn_results_pred)

['DJ Equipment | DJ Zubehör ★ 77records.de', 'Absperrpfosten, Schilder, Fahrradständer und mehr | absperrtechnik24.de', 'Hosting - Avernis', 'E-Commerce & Shopware Agentur aus Linz OÖ ► ACRIS E-Commerce', 'ADEPTO - Reinigungsfachhandel - ', 'AirPlus International – Ihr Partner für Reisekostenmanagement', '', 'Startseite - Alkomat.net - Alkoholtester und Alkomaten von ACE, Dräger und Envitec sowie Drogentester Onlineshop', '489 Ferienwohnungen & Ferienhäuser im Allgäu', 'Allstars-Vertrieb', 'APM Telescopes', 'Die Werbeagentur in Berlin-Friedenau - april agentur', 'Baender24.de\n', 'bag-trends - sicheres Online Shopping!', 'Bagel Gruppe', 'BAULANDO - Arbeitsschutz, Baumaschinen, Diamantscheiben', 'Grill Shop - BBQ Grills & Grillzubehör kaufen » BBQ24.de', 'Bear Family Records - Vinyl, CD, DVD- Musik Mailorder Online', 'Beauty Hills Shop', 'Handverlesene Kreuzfahrten & Schiffsreisen bei BELLEVUE Kreuzfahrten online buchen', 'Der Berentzen Onlineshop', 'Bergfest | Bergans Shop mit 10% Kund

In [None]:
# report how many titles were extracted
print(f'The list contains {len(html_titles_pred)} titles')

The list contains 200 titles


In [None]:
# convert every title to a plain str
html_titles_pred = list(map(str, html_titles_pred))

In [None]:
# get the text content (one entry per path in fn_results_pred)
html_content_pred = readHTMLContentFromFile(fn_results=fn_results_pred)
# NOTE(review): this prints every extracted description at once; consider printing a slice
print(html_content_pred)

['Versandkostenfrei ab 25.- in DE / Tiefpreisgarantie Tel 08241 40 90 10 Fax 08241 40 90 199', 'Sie können sich nach dem Einkauf für die kostenlose Trusted Shops Mitgliedschaft Basic anmelden, inkl. Käuferschutz bis je 100 € für den aktuellen Einkauf sowie für Ihre weiteren Einkäufe in deutschen Shops mit dem Trusted Shops Gütesiegel.', '', 'In Sachen E-Commerce kann uns keiner etwas vormachen! ACRIS ist DER Ansprechpartner für E-Commerce Consulting,\xa0Entwicklung von Shopware Webshops und Online Marketing. Wir sind seit mehr als 16 Jahren als E-Commerce Agentur erfolgreich. Außerdem haben wir zahlreiche Projekte im B2C und B2B Umfeld umgesetzt.', 'Versandkosten', 'Für eine bestmögliche Nutzung verwendet unsere Website Cookies. Wenn Sie fortfahren, gehen wir davon aus, dass Sie der Verwendung von Cookies durch unsere Website zustimmen. Weitere Informationen finden Sie in unserem Datenschutzhinweis. Weiter Datenschutzhinweis', 'Die Domain "zitrom.alfahosting.org" ist nicht verfügbar.',

In [None]:
# report how many descriptions were extracted
print(f'The list contains {len(html_content_pred)} descriptions')

The list contains 200 descriptions


In [None]:
# combine file names, titles and text content row by row into a DataFrame
pred_rows = list(zip(files_pred, html_titles_pred, html_content_pred))
df_pred = pd.DataFrame(pred_rows, columns=['Website', 'Title', 'Description'])
df_pred.head(20)

Unnamed: 0,Website,Title,Description
0,77records.de.html,DJ Equipment | DJ Zubehör ★ 77records.de,Versandkostenfrei ab 25.- in DE / Tiefpreisgar...
1,absperrtechnik24.de.html,"Absperrpfosten, Schilder, Fahrradständer und m...",Sie können sich nach dem Einkauf für die koste...
2,ackermedia.de.html,Hosting - Avernis,
3,acris-ecommerce.at.html,E-Commerce & Shopware Agentur aus Linz OÖ ► AC...,In Sachen E-Commerce kann uns keiner etwas vor...
4,adepto-shop.de.html,ADEPTO - Reinigungsfachhandel -,Versandkosten
5,airplus.com.html,AirPlus International – Ihr Partner für Reisek...,Für eine bestmögliche Nutzung verwendet unsere...
6,alfahosting.org.html,,"Die Domain ""zitrom.alfahosting.org"" ist nicht ..."
7,alkomat.net.html,Startseite - Alkomat.net - Alkoholtester und A...,window.onAmazonLoginReady = function () {\n ...
8,allgaeu-travel.com.html,489 Ferienwohnungen & Ferienhäuser im Allgäu,
9,allstars-direktvertrieb.de.html,Allstars-Vertrieb,direkt kaufen


**Building text corpus**

In [None]:
# join each title with its description, separated by a single space
df_pred['Text Corpus'] = [
    title + ' ' + description
    for title, description in zip(html_titles_pred, html_content_pred)
]

In [None]:
# show the first 5 entries of the text corpus
print(df_pred['Text Corpus'].head(5))

0    DJ Equipment | DJ Zubehör ★ 77records.de Versa...
1    Absperrpfosten, Schilder, Fahrradständer und m...
2                                   Hosting - Avernis 
3    E-Commerce & Shopware Agentur aus Linz OÖ ► AC...
4       ADEPTO - Reinigungsfachhandel -  Versandkosten
Name: Text Corpus, dtype: object


**Exporting the DataFrame to csv**

In [None]:
# export the DataFrame to csv; index=False leaves the row index out of the file
df_pred.to_csv('/content/drive/MyDrive/Webshop_classification/data/pred_dataset.csv', index=False)