# Import libraries & Setup dataframe

In [2]:
import pandas as pd
import matplotlib as plt
from bs4 import BeautifulSoup
import requests
from datetime import datetime
import os


In [8]:
df = pd.read_csv('../data/nlp_features_abstract.csv')

In [9]:
df.columns

Index(['h1', 'abstract', 'meta_title', 'meta_description', 'merged_url',
       'h1_000', 'h1_001', 'h1_02', 'h1_03', 'h1_04',
       ...
       'abstract_übt', 'abstract_übungen', 'abstract_üppig', 'abstract_üppige',
       'abstract_üppigen', 'abstract_üppiger', 'abstract_üppiges',
       'abstract_ür', 'abstract_ško', 'abstract_škoda'],
      dtype='object', length=35911)

In [11]:
data_file = 'discover_2024-03-26.xlsx'
file_path = '../data/' + data_file
file_path

'../data/discover_2024-03-26.xlsx'

In [12]:
df = pd.read_excel(file_path, sheet_name='data')

In [4]:
# keep raw data for later
df_raw = df.copy()

# Preprocessing of the data

In [5]:
# transform columns to lower columns
columns = [col.lower() for col in df.columns]
df.columns = columns

# show results
df.head()

Unnamed: 0,page_efahrer_id,date,published_at,publish_date_equal_to_date,page_canonical_url,page_name,classification_product,classification_type,title,page_author,daily_likes,daily_dislikes,word_count,video_play,impressions,discover_clicks,discover_impressions
0,1010803,2023-01-02,NaT,N,https://efahrer.chip.de/news/tariferhoehungen-...,efa-1010803 | Tariferhöhungen und THG-Prämie: ...,THG,News,Tariferhöhungen und THG-Prämie: Ladesäulenbet...,Karl Lüdecke,,,,1261.0,1375.0,1301.0,20323.0
1,1010592,2023-01-02,NaT,N,https://efahrer.chip.de/news/das-logo-von-alfa...,efa-1010592 | Alfa Romeo: Was bedeuten Schlang...,Auto,News,Alfa Romeo: Was bedeuten Schlange und Kreuz?,Karl Müller,,,,286.0,298.0,164.0,1493.0
2,1010719,2023-01-05,NaT,N,https://efahrer.chip.de/news/titel-ist-zurueck...,efa-1010719 | Rennen um die effizienteste Sola...,Solaranlagen,News,Rennen um die effizienteste Solarzelle: Deuts...,Aslan Berse,,,,156.0,300.0,303.0,4912.0
3,1010727,2023-01-05,NaT,N,https://efahrer.chip.de/news/entlastungen-fuer...,efa-1010727 | Antrag stellen oder leer ausgehe...,Energie,Ratgeber,Antrag stellen oder leer ausgehen: Diese Entl...,CHIP,,,,16.0,55.0,14009.0,92422.0
4,1010557,2023-01-02,2023-01-02,Y,https://efahrer.chip.de/news/solaranlage-auch-...,efa-1010557 | Balkonkraftwerk kaufen: Das sind...,Balkonkraftwerk,Kaufberatung,Balkonkraftwerk kaufen: Das sind die besten M...,Eva Goldschald,17.0,1.0,1513.0,174.0,128.0,6494.0,114984.0


# Getting to know the data

In [6]:
df.describe()

Unnamed: 0,page_efahrer_id,date,published_at,daily_likes,daily_dislikes,word_count,video_play,impressions,discover_clicks,discover_impressions
count,132846.0,132846,42111,33623.0,27291.0,41639.0,132070.0,132070.0,132070.0,132070.0
mean,803389.1,2023-08-11 17:57:35.291089408,2023-11-03 22:30:28.595853568,3.590548,2.693525,665.424986,1441.797108,1836.826244,2735.705,31474.64
min,1037.0,2023-01-01 00:00:00,2019-02-18 00:00:00,-84.0,-59.0,100.0,0.0,0.0,0.0,50.0
25%,1010317.0,2023-05-05 00:00:00,2023-09-06 00:00:00,0.0,0.0,415.0,7.0,20.0,25.0,281.0
50%,1012469.0,2023-08-03 00:00:00,2023-12-13 00:00:00,0.0,0.0,528.0,54.0,107.0,149.0,2148.0
75%,1014952.0,2023-11-23 00:00:00,2024-01-29 00:00:00,1.0,1.0,689.0,379.0,628.0,948.75,12306.75
max,1018782.0,2024-03-23 00:00:00,2024-03-21 00:00:00,2568.0,2629.0,5306.0,703622.0,708360.0,1053606.0,10884350.0
std,383675.9,,,38.784864,42.709102,495.811902,8957.003037,10127.957168,15423.45,158897.4


In [7]:
df.page_efahrer_id.unique().shape

(6899,)

## Getting a better understanding of the features

Is the ID in PAGE_EFAHRER_ID the same as the ID in PAGE_NAME?

In [8]:
# Transform data type for the containment check
df['page_efahrer_id'] = df['page_efahrer_id'].astype('string')
df['page_name'] = df['page_name'].astype('string')

# Function to check whether Page ID is part of page name
def check_containment(row):
    return row['page_efahrer_id'] in row['page_name']

df['containment_check'] = df.apply(check_containment, axis=1)

# Check whether results consist only True values
df['containment_check'].unique()

# Yes, the IDs are always the same

# Drop row and transform data type back
df.drop('containment_check', axis=1, inplace=True)
df['page_efahrer_id'] = df['page_efahrer_id'].astype('int')

# Missing data

In [9]:
df.isna().sum()

page_efahrer_id                    0
date                               0
published_at                   90735
publish_date_equal_to_date         0
page_canonical_url                 0
page_name                          0
classification_product           655
classification_type              655
title                              0
page_author                        0
daily_likes                    99223
daily_dislikes                105555
word_count                     91207
video_play                       776
impressions                      776
discover_clicks                  776
discover_impressions             776
dtype: int64

Publish date "published_at" can be imputed by setting publish date to first date of occurrence in the data set.
Classification Type "classification_type" can be imputed by extracting it from URL.


# TBD IMPUTING OF MISSING DATA

# Scrape data (tittles, actual date)

In [None]:
# url = 'https://efahrer.chip.de/news/engea-im-schnellcheck-wirklich-deutschlands-komfortabelste-wallbox-loesung_1011849'

# html = requests.get(url)

# soup = BeautifulSoup(html.text, 'html.parser')

In [45]:
i=0
for row_idx in scraping.index:
    url = scraping.loc[row_idx, 'page_canonical_url']
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')

    ### meta data ###
    # meta title, displayed in search results
    scraping.loc[row_idx, 'title'] = soup.find('title').text
    # meta description, displayed in search results
    scraping.loc[row_idx, 'meta_description'] = soup.find('meta', attrs={'name': 'description'}).get('content')
    # meta image, primarily for social media, but might be displayed in search results
    scraping.loc[row_idx, 'meta_image_url'] = soup.find('meta', attrs={'property': 'og:image'}).get('content')

    # media type for first big image/video on page
    h4_element = soup.find('h4', class_='mt-0 credentials open-sans-regular')  # Find the h4 element with specific class
    if h4_element:
        next_div = h4_element.find_next('div')  # Find the next div element after the h4
        scraping.loc[row_idx, 'media_type'] = next_div.get('class') if next_div else None  # Get the class attribute of the next div
        if 'img-wrapper' in media_type:
            scraping.loc[row_idx, 'media_type'] = 'img'
        elif 'mb-3 video-player recobar' in media_type:
            scraping.loc[row_idx, 'media_type'] = 'video'
        else:
            scraping.loc[row_idx, 'media_type'] = media_type
            # VIDEO = {'class'='mb-3 video-player recobar'}
            # IMAGE = {'class'='img-wrapper'}

    # image size
    if scraping.loc[row_idx, 'media_type'] == 'img':
    #    page_img_url = soup.find(id='content').find('article').find('div', {'class':'img-wrapper'}).find('img').get('src').text
        scraping.loc[row_idx, 'page_img_size'] = soup.find(id='content').find('article').find('div', {'class':'img-wrapper'}).find('img').get('sizes')


    ### user-visible data ###
    # first headline on the article page
    scraping.loc[row_idx, 'h1'] = soup.find('h1').text
    # author displayed on the article page
    scraping.loc[row_idx, 'author'] = soup.find(id='content').find('article').find('h4').find('a').text
    # date displayed on the article page
    scraping.loc[row_idx, 'date'] = soup.find(id='content').find('article').find('h4').find('span').text
    # abstract, first text paragraph of the article
    scraping.loc[row_idx, 'abstract'] = soup.find(id='content').find('article').find('p').text

    #h4 = soup.find_all('h4', {'class': 'mt-0 credentials open-sans-regular'})
    #img = soup.find("article", {"class": "single-article"}).find('div', {'class': 'img-wrapper'}).find("div", {'class': 'caption'}).find("span", {"class": "p img-title"}).text

    i+=1
    if i==10: 
        scraping.to_csv('../data/temp_scraped.csv')
        i=0

In [56]:
# Path to the folder containing HTML files
folder_path = '../data/pages'

# Initialize a list to hold the scraped data
scraped_data = []

# Iterate over HTML files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith('.html'):  # Check if the file is an HTML file
        file_path = os.path.join(folder_path, filename)
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                html_content = file.read()
                soup = BeautifulSoup(html_content, 'html.parser')

                # Extract meta data
                url = soup.find('link', {'rel':'canonical'}).get('href')
                meta_title = soup.find('title').text
                meta_description = soup.find('meta', attrs={'name': 'description'}).get('content')
                meta_image_url = soup.find('meta', attrs={'property': 'og:image'}).get('content')

                # Extract media type
                media_type_element = soup.find('h4', class_='mt-0 credentials open-sans-regular').find_next('div')
                media_type_class = media_type_element.get('class') if media_type_element else None
                media_type = 'img' if 'img-wrapper' in media_type_class else 'video' if 'mb-3 video-player recobar' in media_type_class else None

                # Extract image size
                page_img_size = None
                if media_type == 'img':
                    page_img_size = soup.find(id='content').find('article').find('div', {'class':'img-wrapper'}).find('img').get('sizes')

                # Extract user-visible data
                h1 = soup.find('h1').text
                author = soup.find(id='content').find('article').find('h4').find('a').text
                date = soup.find(id='content').find('article').find('h4').find('span').text
                abstract = soup.find(id='content').find('article').find('p').text

                # Append scraped data to the list
                scraped_data.append({
                    'filename': filename,                     
                    'url': url,
                    'meta_title': meta_title,
                    'meta_description': meta_description,
                    'meta_image_url': meta_image_url,
                    'media_type': media_type,
                    'page_img_size': page_img_size,
                    'h1': h1,
                    'author': author,
                    'date': date,
                    'abstract': abstract
                })

        except Exception as e:
            print(f"Error processing file: {file_path}, {e}")

# Convert the list of dictionaries to a DataFrame
scraped_df = pd.DataFrame(scraped_data)

# Write the DataFrame to a CSV file
scraped_df.to_csv('../data/temp_scraped.csv', index=False)

Error processing file: ../data/pages/index.html, 'NoneType' object has no attribute 'get'


In [83]:
# processing of the scraped data
scraped_df['page_ID'] = scraped_df['filename'].apply(lambda x: x.split('.')[0])
scraped_df.drop('filename', axis=1, inplace=True)

scraped_df['meta_title'] = scraped_df['meta_title'].apply(lambda x: x.rsplit('-', 1)[0])

# Apply strip() method to remove leading and trailing whitespaces from all string columns
scraped_df = scraped_df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

#scraped_df['date'] = pd.to_datetime(scraped_df['date'], format='%d. %B %Y')
scraped_df['date'] = pd.to_datetime(scraped_df['date'], errors='coerce')

scraped_df['page_img_size'] = scraped_df['page_img_size'].apply(lambda x: x.split(',')[0] if x else None)
scraped_df['page_img_size'] = scraped_df['page_img_size'].apply(lambda x: x.split(')')[-1] if x else None)

KeyError: 'filename'

In [55]:
# unique ids
scraping = df[["page_efahrer_id", "page_canonical_url"]].drop_duplicates(subset = ["page_efahrer_id"])
scraping

Unnamed: 0,page_efahrer_id,page_canonical_url
0,1010803,https://efahrer.chip.de/news/tariferhoehungen-...
1,1010592,https://efahrer.chip.de/news/das-logo-von-alfa...
2,1010719,https://efahrer.chip.de/news/titel-ist-zurueck...
3,1010727,https://efahrer.chip.de/news/entlastungen-fuer...
4,1010557,https://efahrer.chip.de/news/solaranlage-auch-...
...,...,...
130737,1016319,https://efahrer.chip.de/news/elektro-beliebter...
131077,1010895,https://efahrer.chip.de/news/nie-wieder-kabels...
131188,1018743,https://efahrer.chip.de/news/irren-pfusch-am-e...
131479,1017718,https://efahrer.chip.de/news/alle-59-meter-ein...


In [56]:
# creating the dummy columns that will be populated with the scraped data
scraping['H1'] = 'dummy'
scraping['last_update_date'] = 'today'
scraping['abstract'] = 'null'

scraping = scraping.set_index('page_efahrer_id')

scraping

Unnamed: 0_level_0,page_canonical_url,H1,last_update_date,abstract
page_efahrer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1010803,https://efahrer.chip.de/news/tariferhoehungen-...,dummy,today,
1010592,https://efahrer.chip.de/news/das-logo-von-alfa...,dummy,today,
1010719,https://efahrer.chip.de/news/titel-ist-zurueck...,dummy,today,
1010727,https://efahrer.chip.de/news/entlastungen-fuer...,dummy,today,
1010557,https://efahrer.chip.de/news/solaranlage-auch-...,dummy,today,
...,...,...,...,...
1016319,https://efahrer.chip.de/news/elektro-beliebter...,dummy,today,
1010895,https://efahrer.chip.de/news/nie-wieder-kabels...,dummy,today,
1018743,https://efahrer.chip.de/news/irren-pfusch-am-e...,dummy,today,
1017718,https://efahrer.chip.de/news/alle-59-meter-ein...,dummy,today,


In [58]:
## Do actual scraping
i=0
for row_idx in scraping.index:
    url = scraping.loc[row_idx, 'page_canonical_url']
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')

    scraping.loc[row_idx, 'H1'] = soup.find('h1').text
    scraping.loc[row_idx, 'last_update_date'] = soup.find('h4').find('span').text
    scraping.loc[row_idx, 'abstract'] = soup.find("article", {"class": "single-article"}).find('p').text

    scraping.loc[row_idx, 'abstract'] = soup.find("article", {"class": "single-article"}).find('p').text


    i+=1
    if i==10: 
        scraping.to_csv('../data/temp_scraped.csv')
        i=0

AttributeError: 'NoneType' object has no attribute 'text'

In [None]:
# Extracting the date
# Find the span element within the h4 tag
date_span = soup.find('h4').find('span')

# Extract the text containing the date
date_text = date_span.text

abstract_p = soup.find('p')

# Extract the text containing the date
abstract_text = abstract_p.text

# Cleaning to be done

1. getting only the last part of the URL
2. 

In [None]:
data_file = 'discover_2024-03-26.xlsx'
file_path = '../data/' + data_file
file_path

'../data/discover_2024-03-26.xlsx'

In [None]:
df = pd.read_excel(file_path, sheet_name='data')

# To be sorted/ organized

In [None]:
df_pub = df.query("publish_date_equal_to_date == 'Y'")

In [None]:
df.page_efahrer_id.unique()

In [None]:
df_pub.page_efahrer_id.unique()

In [None]:
print(df.daily_likes.isna().sum())
print(df.daily_likes.notna().sum())

In [None]:
df.groupby("page_efahrer_id").daily_likes.head()


In [None]:
df.head()

In [None]:
import matplotlib as plt
import seaborn as sns

In [None]:
daily_likes = df.daily_likes.
daily_likes
#sns.histplot(df.daily_likes.unique())

In [None]:
non_null_likes = df['daily_likes'].dropna()
non_null_likes.unique().min()
#sns.histplot(non_null_likes)

In [None]:
df.page_efahrer_id.unique().shape

In [None]:
df.drop_duplicates("page_efahrer_id", "date", "DISCOVER_IMPRESSIONS")

In [None]:
df.groupby("page_efahrer_id").count()

In [None]:
df[["PAGE_EFAHRER_ID", "PAGE_NAME"]].drop_duplicates().groupby("PAGE_NAME").count().max()