# Web-Scraping: Playground

In [472]:
import os
from datetime import datetime
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np

## Part 1: Manually Scraping LinkedIn

In [252]:
STORAGE_DIR = "jobportals"

In [10]:
url = "https://www.linkedin.com/"

In [12]:
response = requests.get(url, allow_redirects=True)

----

**Note:**

Die Funktion requests.get(url) stellt eine HTTP-Verbindung auf, womit sich die folgenden Informationen erfragen lassen:

- Status-Code: `response.status_code`
- Body: `response.content`
- Encoding: `repsonse.encoding`
- Header: `response.header`
- JSON Inhalt (falls API vorhanden): `response.json()`
- Finale URL nach Redirects: `response.redirect`
- Cookies: `response.cookies`

----

In [23]:
response.encoding

'utf-8'

In [28]:
response.status_code

200

In [32]:
response.url

'https://www.linkedin.com/'

In [52]:
print(response.content[0:700], "[...]")

b'<!DOCTYPE html>\n\n\n    \n    \n    \n    \n    \n    \n    \n    \n\n    \n    \n    \n    \n\n    \n    <html lang="en">\n      <head>\n        <meta name="pageKey" content="d_homepage-guest-home">\n<!----><!----><!---->        <meta name="locale" content="en_US">\n        <meta id="config" data-app-version="2.1.2128" data-call-tree-id="AAY0Twsjl++ggmfwzBRfmg==" data-jet-tags="guest-homepage" data-multiproduct-name="homepage-guest-frontend" data-service-name="homepage-guest-frontend" data-browser-id="b038e120-2455-48de-89c9-476fe184286d" data-enable-page-view-heartbeat-tracking data-page-instance="urn:li:page:d_homepage-guest-home;3FZ5HyMDTY+i/C6NdifM1A==" data-disable-jsbeacon-pagekey-suffix="false" data-memb' [...]


In [14]:
print(response.text[0:700], "[...]")

<!DOCTYPE html>


    
    
    
    
    
    
    
    

    
    
    
    

    
    <html lang="en">
      <head>
        <meta name="pageKey" content="d_homepage-guest-home">
<!----><!----><!---->        <meta name="locale" content="en_US">
        <meta id="config" data-app-version="2.1.2136" data-call-tree-id="AAY022MT8Sy/4JJBMeGMrQ==" data-jet-tags="guest-homepage" data-multiproduct-name="homepage-guest-frontend" data-service-name="homepage-guest-frontend" data-browser-id="fb9572a4-80c5-4062-8e81-5b6da6fb1ae5" data-enable-page-view-heartbeat-tracking data-page-instance="urn:li:page:d_homepage-guest-home;2AXMjT8BR7W279Q7lkuS2g==" data-disable-jsbeacon-pagekey-suffix="false" data-memb [...]


Die extrahierten Daten in einer HTML-Datei speichern

In [97]:
with open(os.path.join(STORAGE_DIR, "lkn.html"), "wb") as f:
    f.write(response.content)

## Part 2: Automatically Scraping Multiple Job Portals

Einen Dictionary mit der Funktion dict() erstellen.

In [350]:
multiple_jp = dict(
    stps= "https://www.stepstone.de",
    idd= "https://de.indeed.com",
    lkn= "https://www.linkedin.com/jobs",
    mst= "https://www.monster.de",
    abg= "https://jobboerse.arbeitsagentur.de"
)

In [77]:
multiple_jp

{'stps': 'https://www.stepstone.de',
 'idd': 'https://de.indeed.com',
 'lkn': 'https://www.linkedin.com/jobs',
 'mst': 'https://www.monster.de',
 'abg': 'https://jobboerse.arbeitsagentur.de'}

In [157]:
# List of news pages to be scraped
newspaper_urls = dict(
    sz="https://www.sueddeutsche.de/",
    zeit="https://www.zeit.de/index",
    faz="https://www.faz.net/aktuell/",
    ts="https://www.tagesspiegel.de/",
    spiegel="https://www.spiegel.de/",
    kronen="https://www.krone.at/",
    wtf="https://asdfkajwlkejwkejklajsdflksadjfasdf.nix",
)

Aktuelles Datum als String transformieren

In [352]:
now = datetime.now()
str_now = now.strftime("%Y-%m-%d")
print(str_now)

2025-05-11


## Target objects

We will create two objects:

- `content_dict`: a dict with the HTML content of the pages we scraped
- `log_list`: a list with metadata about our requests

In [354]:
content_dict = {}
text_dict = {}
log_list = []
failed_list = []

In [356]:
def scrape_website(name, url):
    #HTTP-Verbindung herstellen
    response = requests.get(url, allow_redirects=True)
    content = response.content
    text = response.text

    # (2) File name to store the raw HTML
    file_name = os.path.join(
        STORAGE_DIR,f"{str_now}-{name}.html")
    
    # (3) Write raw HTML
    with open(file_name, "wb") as f:
        f.write(response.content)

    # (4) Fill content_dict and text_dict
    content_dict[name] = response.content
    text_dict[name] = response.text

    # (5) Fill log_list
    log_info = dict(
        name=name,
        file_name=file_name,
        date=str_now,
        status_code=response.status_code,
        url=response.url,
        encoding=response.encoding,
    )
    log_list.append(log_info)


In [358]:
for name, url in multiple_jp.items():
    try:
        scrape_website(name, url)
        print("Die URL wurde erfolgreich abgerufen \n%s" % (url))
    except:
        failed_list.append((name,url))
        print("Erfolgreich abgerufen \n%s /n%s" % (name, url))

Erfolgreich abgerufen 
stps /nhttps://www.stepstone.de
Die URL wurde erfolgreich abgerufen 
https://de.indeed.com
Die URL wurde erfolgreich abgerufen 
https://www.linkedin.com/jobs
Die URL wurde erfolgreich abgerufen 
https://www.monster.de
Die URL wurde erfolgreich abgerufen 
https://jobboerse.arbeitsagentur.de


In [360]:
log_df = pd.DataFrame(log_list)

In [362]:
log_df

Unnamed: 0,name,file_name,date,status_code,url,encoding
0,idd,jobportals/2025-05-11-idd.html,2025-05-11,403,https://de.indeed.com/,UTF-8
1,lkn,jobportals/2025-05-11-lkn.html,2025-05-11,200,https://www.linkedin.com/jobs,utf-8
2,mst,jobportals/2025-05-11-mst.html,2025-05-11,200,https://www.monster.de/,ISO-8859-1
3,abg,jobportals/2025-05-11-abg.html,2025-05-11,200,https://www.arbeitsagentur.de/jobsuche/willkommen,UTF-8


In [207]:
df_failed_list = pd.DataFrame(failed_list)

In [209]:
df_failed_list

Unnamed: 0,0,1
0,stps,https://www.stepstone.de
1,idd,https://de.indeed.com
2,lkn,https://www.linkedin.com/jobs
3,mst,https://www.monster.de
4,abg,https://jobboerse.arbeitsagentur.de


## T: Word count

### HTML File erstellen und den Content extrahieren

In [377]:
idd_url = "https://de.indeed.com"

In [380]:
idd_response = requests.get(idd_url, allow_redirects=True)

In [386]:
with open(os.path.join(STORAGE_DIR, "idd.html"), "wb") as f:
    idd_content = f.write(idd_response.content)

### HTML File einlesen

In [485]:
idd_response.encoding

'UTF-8'

In [408]:
idd_html_file = os.path.join("jobportals", "idd.html")

In [649]:
 with open(idd_html_file, "r", encoding="utf-8") as f:
     idd_text = f.read()

In [482]:
print(idd_text[0:700], "[...]")

<!DOCTYPE html><html lang="en"><head><meta charset="utf-8"><title>Blocked - Indeed.com</title><meta name="viewport" content="width=device-width, initial-scale=1"><style>:root{color-scheme:light dark;--background-color:#fff;--primary-1000:#0d2d5e;--primary-900:#164081;--primary-800:#2557a7;--primary-700:#3f73d3;--primary-600:#6792f0;--neutral-1000:#2d2d2d;--neutral-900:#424242;--neutral-400:#d4d2d0;--dark-1000:#040606;--link-color:var(--primary-800);--link-color-hover:var(--primary-900);--menu-background-color:#fff;--text-color:var(--neutral-1000);--text-color-hover:var(--neutral-900);--default-transition:cubic-bezier(.645,.045,.355,1);--menu-transition:.28s all .12s ease-out;--font-family:"N [...]


In [651]:
soup = BeautifulSoup(idd_text, "html.parser")

In [653]:
idd_text = soup.text

In [494]:
idd_text[0:700]

'Blocked - Indeed.com\n        Find jobs   Company reviews   Find salaries    Sign in       Upload your resume   Sign in   Employers / Post Job   Find jobs   Company reviews   Find salaries      Request Blocked You have been blocked. If you believe this in error, please go to support.indeed.com and reference the following information: Your Ray ID for this request is 93f3b1f459021953 Your current IP for this request is 104.28.193.94 Return home \xa0 → Troubleshooting Cloudflare Errors Need more help? Contact us  '

In [500]:
idd_text = idd_text.replace("\n", " ")

In [502]:
idd_text[0:700]

'Blocked - Indeed.com         Find jobs   Company reviews   Find salaries    Sign in       Upload your resume   Sign in   Employers / Post Job   Find jobs   Company reviews   Find salaries      Request Blocked You have been blocked. If you believe this in error, please go to support.indeed.com and reference the following information: Your Ray ID for this request is 93f3b1f459021953 Your current IP for this request is 104.28.193.94 Return home \xa0 → Troubleshooting Cloudflare Errors Need more help? Contact us  '

In [506]:
idd_text = idd_text.lower()

In [508]:
idd_text

'blocked - indeed.com         find jobs   company reviews   find salaries    sign in       upload your resume   sign in   employers / post job   find jobs   company reviews   find salaries      request blocked you have been blocked. if you believe this in error, please go to support.indeed.com and reference the following information: your ray id for this request is 93f3b1f459021953 your current ip for this request is 104.28.193.94 return home \xa0 → troubleshooting cloudflare errors need more help? contact us  '

In [512]:
items = idd_text.split(" ")

In [519]:
items[0:20]

['blocked',
 '-',
 'indeed.com',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'find',
 'jobs',
 '',
 '',
 'company',
 'reviews',
 '',
 '',
 'find']

In [529]:
items = [i for i in items if len(i) > 1]
items[0:20]

['blocked',
 'indeed.com',
 'find',
 'jobs',
 'company',
 'reviews',
 'find',
 'salaries',
 'sign',
 'in',
 'upload',
 'your',
 'resume',
 'sign',
 'in',
 'employers',
 'post',
 'job',
 'find',
 'jobs']

In [541]:
pd.value_counts(items)[0:20]

  pd.value_counts(items)[0:20]
  pd.value_counts(items)[0:20]


find                4
request             3
in                  3
this                3
your                3
for                 2
is                  2
you                 2
blocked             2
jobs                2
company             2
salaries            2
reviews             2
sign                2
contact             1
104.28.193.94       1
ray                 1
id                  1
93f3b1f459021953    1
current             1
Name: count, dtype: int64

### Import stop words

In [546]:
stopwords_url = "https://raw.githubusercontent.com/solariz/german_stopwords/master/german_stopwords_full.txt"

In [579]:
stopword = requests.get(stopwords_url, allow_redirects=True)
stopword_text = stopword.text
stopword_text[0:700]

'; GERMAN STOPWORDS\n; Zusammmengetragen von Marco Götze, Steffen Geyer\n; LAST UPDATE 12/2016\n; Web Stopwords, more information at Source Link below!\n; www.solariz.de\n; Source and more Information: https://solariz.de/de/downloads/6/german-enhanced-stopwords.htm\n; ####\n; Link-Ware; If you use this List somehow please give me a Link to URL mentioned above! Thanks\n; ####\nab\naber\nabermaliges\nabermals\nabgerufen\nabgerufene\nabgerufener\nabgerufenes\nabgesehen\nacht\naehnlich\naehnliche\naehnlichem\naehnlichen\naehnlicher\naehnliches\naehnlichste\naehnlichstem\naehnlichsten\naehnlichster\naehnlichstes\naeusserst\naeusserste\naeusserstem\naeussersten\naeusserster\naeusserstes\nähnlich\nähnliche\nähnlichem\nähnlichen\nähnlicher'

In [596]:
stopword_list = stopword.text.split("\n")[9:]
stopword_list[0:20]

['ab',
 'aber',
 'abermaliges',
 'abermals',
 'abgerufen',
 'abgerufene',
 'abgerufener',
 'abgerufenes',
 'abgesehen',
 'acht',
 'aehnlich',
 'aehnliche',
 'aehnlichem',
 'aehnlichen',
 'aehnlicher',
 'aehnliches',
 'aehnlichste',
 'aehnlichstem',
 'aehnlichsten',
 'aehnlichster']

In [604]:
items = [i for i in items if i not in stopword_list]
items[:20]

['blocked',
 'indeed.com',
 'find',
 'jobs',
 'company',
 'reviews',
 'find',
 'salaries',
 'sign',
 'upload',
 'your',
 'resume',
 'sign',
 'employers',
 'post',
 'job',
 'find',
 'jobs',
 'company',
 'reviews']

### Kurzfassung

In [633]:
stopwords_url = "https://raw.githubusercontent.com/solariz/german_stopwords/master/german_stopwords_full.txt"
stopword_list = requests.get(stopwords_url, allow_redirects=True).text.split("\n")[9:]
stopword_list[0:20]

['ab',
 'aber',
 'abermaliges',
 'abermals',
 'abgerufen',
 'abgerufene',
 'abgerufener',
 'abgerufenes',
 'abgesehen',
 'acht',
 'aehnlich',
 'aehnliche',
 'aehnlichem',
 'aehnlichen',
 'aehnlicher',
 'aehnliches',
 'aehnlichste',
 'aehnlichstem',
 'aehnlichsten',
 'aehnlichster']

In [699]:
def process_html(text):
    items = text.replace("\n", " ").lower().split(" ")
    items = [i for i in items if len(i) > 1 and i not in stopword_list]
    return items

In [701]:
 with open(idd_html_file, "r", encoding="utf-8") as f:
     idd_text = f.read()

In [703]:
soup = BeautifulSoup(idd_text, "html.parser")

In [705]:
idd_text = soup.text

In [709]:
pd.value_counts(process_html(idd_text))[0:20]

  pd.value_counts(process_html(idd_text))[0:20]
  pd.value_counts(process_html(idd_text))[0:20]


find                4
request             3
this                3
your                3
sign                2
is                  2
you                 2
blocked             2
jobs                2
reviews             2
company             2
salaries            2
contact             1
return              1
id                  1
93f3b1f459021953    1
current             1
ip                  1
104.28.193.94       1
home                1
Name: count, dtype: int64