# Imported Libraries

In [23]:
import camelot
import pandas as pd
import numpy as np

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium import webdriver

## 1) Extracting Sparkassen Info from PDF

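The PDF extraction puts all values of a column into a single cell, separated by newlines. The following helper (taken from Stack Overflow) expands such list-valued cells into one row per value.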

In [224]:
# https://stackoverflow.com/questions/45846765/efficient-way-to-unnest-explode-multiple-list-columns-in-a-pandas-dataframe

def explode(df, lst_cols, fill_value=''):
    """Expand list-valued cells into one row per list element.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose `lst_cols` columns hold list-like cells.
    lst_cols : str or list of str
        Column(s) to expand. A single name is wrapped into a list. The lists
        of one row must have the same length across all `lst_cols`; the
        lengths are taken from the first of these columns.
    fill_value : scalar, default ''
        Value written into missing cells of rows that are re-attached because
        their list was empty.

    Returns
    -------
    pandas.DataFrame
        Frame with the same columns, in the same order, as `df`. Rows whose
        list was empty are placed after the expanded rows and keep their
        original index label.
    """
    # make sure `lst_cols` is a list
    if lst_cols and not isinstance(lst_cols, list):
        lst_cols = [lst_cols]
    # all columns except `lst_cols`
    idx_cols = df.columns.difference(lst_cols)

    # calculate lengths of lists (once; reused for repeating and filtering)
    lens = df[lst_cols[0]].str.len()

    # repeat the scalar columns once per list element and flatten the lists
    exploded = pd.DataFrame({
        col: np.repeat(df[col].values, lens)
        for col in idx_cols
    }).assign(**{col: np.concatenate(df[col].values) for col in lst_cols})

    if not (lens > 0).all():
        # At least one list is empty: such rows vanish during flattening, so
        # re-attach them. `pd.concat` replaces `DataFrame.append`, which no
        # longer exists in pandas >= 2.0.
        exploded = pd.concat([exploded, df.loc[lens == 0, idx_cols]]) \
                     .fillna(fill_value)

    # restore the original column order
    return exploded.loc[:, df.columns]

In [244]:
tables = camelot.read_pdf('Sparkassenrangliste_2018.pdf', pages='all')

# Column names assigned to every table extracted from the PDF
column_names = ['Rang','Institut','Sitz','Verband','Bilanzsumme gemäß Bilanzstatistik','Kundeneinlagen','Spareinlagen', 'Kundenkredite','Anzahl Mitarbeiter','Sparkassenstellen (einschließlich SB)']

# Clean each extracted table separately and concatenate once at the end.
# (`DataFrame.append` was removed in pandas 2.0, and growing a frame inside a
# loop copies all previous rows on every iteration.)
frames = []
for table in tables:
    # Work on a copy so that re-running this cell does not try to drop the
    # leading rows from an already trimmed frame.
    page = table.df.copy()
    page.columns = column_names
    # Drop the first three rows of every table
    # (presumably the header lines of the PDF table - TODO confirm)
    page = page.drop(index=[0, 1, 2])
    # Every cell holds newline-separated values: split into lists, then
    # expand the lists into one row per value
    page = page.apply(lambda text: text.str.split('\n'))
    page = page.reset_index(drop=True)
    frames.append(explode(page, list(page.columns)))

df = pd.concat(frames).set_index('Rang')

numeric_columns = list(df.columns)[3:]
text_columns = list(df.columns)[:3]

# Get rid of '.' to convert to numeric. `regex=False` makes sure the dot is
# treated as a literal character regardless of the pandas version.
df[numeric_columns] = df[numeric_columns].apply(
    lambda number: number.str.replace('.', '', regex=False))

# Convert columns into integers
for column in numeric_columns:
    df[column] = pd.to_numeric(df[column])

# Remove any potential whitespace to have clean text
for column in text_columns:
    df[column] = df[column].str.strip()

# Save df as csv for later use in the scraper
df.to_csv('sparkassen_rangliste_cleaned.csv')

## 2) Getting the website URL for each Sparkasse with Selenium

In [None]:
# Use `Rang` as the index again (as it was when the file was written), so that
# single records can be addressed by their rank further down.
df = pd.read_csv('sparkassen_rangliste_cleaned.csv', index_col='Rang')

sparkaseen_links = []

# Extracting sparkassen from df and constructing search url
sparkassen = df['Institut'].to_list()
google_search_url = 'https://www.google.de/search?q='

# Setting driver to Chrome
# NOTE(review): machine-specific driver path; `executable_path` is deprecated
# in Selenium 4 - pass a `Service` object instead when upgrading.
driver = webdriver.Chrome(
    executable_path='/Users/felixvemmer/OneDrive/Dokumente/Hobbies/Programming/Python/chromedriver')

try:
    for sparkasse in sparkassen:
        google_url = google_search_url + sparkasse.replace(' ', '+')
        driver.get(google_url)
        # Click the first <h3> on the result page and record the URL the
        # browser ends up on. `find_element(By..., ...)` works with
        # Selenium 3 and 4; the `find_element_by_*` helpers are gone in 4.x.
        driver.find_element(By.CSS_SELECTOR, 'h3').click()
        sparkaseen_links.append(driver.current_url)
finally:
    # Always close the browser, even when a lookup fails half-way through
    driver.quit()

df['links'] = sparkaseen_links

# Correcting one record manually.
# After `read_csv` the rank is expected to be an integer label. Writing to a
# label that does not exist would silently append a new row, so fail loudly.
if 122 not in df.index:
    raise KeyError("Rang 122 not found in index - cannot apply manual link correction")
df.at[122, 'links'] = 'https://www.sparkasse-hegau-bodensee.de/de/home.html'

# Save df as csv for later use in the scraper
df.to_csv('sparkassen_rangliste_cleaned.csv')

## 3) Use Scrapy to test and generate different patterns for target url

## 4) Join verified urls to existing dataframe for final scraping

In [139]:
# Cleaned Sparkassen table; its `links` column is used below to derive a domain.
sparkassen = pd.read_csv('sparkassen_rangliste_cleaned.csv')
# URLs with a `verified_url` column, read from a separate file.
# NOTE(review): absolute, machine-specific path - the notebook will not run
# elsewhere unless this file is made available under a relative path.
verified_urls = pd.read_csv('/Users/felixvemmer/Desktop/sparkassen_accounts/verified_links.csv')

In [140]:
def _extract_domain(urls):
    """Return the host part of each URL in `urls`, without a leading ``www.``.

    Parameters
    ----------
    urls : pandas.Series
        Series of URL strings such as ``https://www.example.de/de/home.html``.

    Returns
    -------
    pandas.Series
        Host names such as ``example.de``. URLs that do not contain ``//``
        yield NaN.
    """
    # Everything up to '//' is the scheme; an optional literal 'www.' prefix
    # is skipped and the host is captured up to the next '/'.
    # Hosts without 'www.' are kept instead of becoming NaN - NaN keys would
    # otherwise match each other when the two frames are merged on `domain`.
    return urls.str.extract(r'^[^/]*//(?:www\.)?([^/]+)', expand=False)


sparkassen['domain'] = _extract_domain(sparkassen['links'])
verified_urls['domain'] = _extract_domain(verified_urls['verified_url'])

In [150]:
# Show the Sparkassen table (the rendered output is truncated by pandas)
sparkassen

Unnamed: 0,Rang,Institut,Sitz,Verband,Bilanzsumme gemäß Bilanzstatistik,Kundeneinlagen,Spareinlagen,Kundenkredite,Anzahl Mitarbeiter,Sparkassenstellen (einschließlich SB),links,domain
0,1,Hamburger Sparkasse,Hamburg,HSGV,45850582,34447018,8979342,33106755,5186,194,https://www.haspa.de/de/home.html,haspa.de
1,2,Sparkasse KölnBonn,Köln,RSGV,26756221,20341310,5303201,18519614,3922,136,https://www.sparkasse-koelnbonn.de/de/home.html,sparkasse-koelnbonn.de
2,3,Kreissparkasse Köln,Köln,RSGV,26163554,19389649,5745073,19916338,3969,176,https://www.ksk-koeln.de/de/home.html,ksk-koeln.de
3,4,Frankfurter Sparkasse,Frankfurt Main,SGVHT,19146177,16629605,1553458,8372878,1709,84,https://www.frankfurter-sparkasse.de/de/home.html,frankfurter-sparkasse.de
4,5,Stadtsparkasse München,München,SVB,19111868,15680349,4242820,13912104,2663,106,https://www.sskm.de/de/home.html,sskm.de
...,...,...,...,...,...,...,...,...,...,...,...,...
380,381,Sparkasse Battenberg,Battenberg Eder,SGVHT,218688,174253,83651,158904,50,6,https://www.sparkasse-battenberg.de/de/home.html,sparkasse-battenberg.de
381,382,Stadtsparkasse Schwalmstadt,Schwalmstadt,SGVHT,218582,175022,43371,131275,47,3,https://www.stadtsparkasse-schwalmstadt.de/de/...,stadtsparkasse-schwalmstadt.de
382,383,Stadtsparkasse Grebenstein,Grebenstein,SGVHT,217372,180422,86820,137989,52,4,https://www.stadtsparkasse-grebenstein.de/de/h...,stadtsparkasse-grebenstein.de
383,384,Stadtsparkasse Borken (Hessen),Borken,SGVHT,178546,129399,44868,117875,46,1,https://www.sskborken.de/de/home.html,sskborken.de


In [156]:
# Left join: every Sparkasse is kept, with or without a verified URL.
# No `on=` is given, so pandas joins on the columns both frames share
# (judging by the resulting frame this is `domain` - confirm).
# Rows that are exact duplicates after the join are dropped; the index is not
# reset, so index labels may have gaps afterwards.
final = sparkassen.merge(verified_urls, how='left').drop_duplicates()

In [164]:
# Not all URL patterns could be caught automatically, so the remaining links
# are set manually. List the home pages that still lack a verified URL.
without_verified_url = final['verified_url'].isnull()
final.loc[without_verified_url, 'links']

14     https://www.sparkasse-nuernberg.de/en/home.html
44           https://www.sparkasse-ulm.de/en/home.html
89       https://www.sparkasse-bamberg.de/en/home.html
169                  https://www.spktw.de/en/home.html
Name: links, dtype: object

In [168]:
# Manually researched account pages, keyed by the domain of the Sparkasse.
# Rows are addressed by `domain` instead of by index label: the labels depend
# on the outcome of the merge, and the original code assigned label 169 twice,
# which overwrote the spktw.de entry with the sparkasse-mittelsachsen.de URL.
manual_account_urls = {
    'sparkasse-nuernberg.de': 'https://www.sparkasse-nuernberg.de/de/home/privatkunden/girokonto.html',
    'sparkasse-ulm.de': 'https://www.sparkasse-ulm.de/de/home/privatkunden/girokonto.html?n=true&stref=hnav',
    'sparkasse-bamberg.de': 'https://www.sparkasse-bamberg.de/de/home/privatkunden/girokonto.html?n=true&stref=hnav',
    'spktw.de': 'https://www.spktw.de/de/home/privatkunden/girokonto.html?n=true&stref=hnav',
    # NOTE(review): assumes the Mittelsachsen row carries exactly this domain - confirm
    'sparkasse-mittelsachsen.de': 'https://www.sparkasse-mittelsachsen.de/de/home/privatkunden/konten-und-karten/girokonto2.html?n=true',
}

for domain, account_url in manual_account_urls.items():
    domain_rows = final['domain'] == domain
    # An unmatched domain would otherwise be skipped without any notice
    if not domain_rows.any():
        raise KeyError("No row with domain %r found for manual URL correction" % domain)
    final.loc[domain_rows, 'verified_url'] = account_url

In [169]:
# Re-check: list the home pages of rows that still have no verified URL
final.loc[final['verified_url'].isna(), 'links']

Series([], Name: links, dtype: object)

In [172]:
# Give the two URL columns descriptive names
url_column_names = {'links': 'website_url', 'verified_url': 'account_info_url'}
final = final.rename(columns=url_column_names)

In [174]:
# Write the joined table to disk; the index is written as the first, unnamed column
final.to_csv('final_to_scrape.csv')

In [175]:
# Show the joined table (the rendered output is truncated by pandas)
final

Unnamed: 0,Rang,Institut,Sitz,Verband,Bilanzsumme gemäß Bilanzstatistik,Kundeneinlagen,Spareinlagen,Kundenkredite,Anzahl Mitarbeiter,Sparkassenstellen (einschließlich SB),website_url,domain,account_info_url
0,1,Hamburger Sparkasse,Hamburg,HSGV,45850582,34447018,8979342,33106755,5186,194,https://www.haspa.de/de/home.html,haspa.de,https://www.haspa.de/de/home/privatkunden/giro...
1,2,Sparkasse KölnBonn,Köln,RSGV,26756221,20341310,5303201,18519614,3922,136,https://www.sparkasse-koelnbonn.de/de/home.html,sparkasse-koelnbonn.de,https://www.sparkasse-koelnbonn.de/de/home/pri...
2,3,Kreissparkasse Köln,Köln,RSGV,26163554,19389649,5745073,19916338,3969,176,https://www.ksk-koeln.de/de/home.html,ksk-koeln.de,https://www.ksk-koeln.de/de/home/privatkunden/...
3,4,Frankfurter Sparkasse,Frankfurt Main,SGVHT,19146177,16629605,1553458,8372878,1709,84,https://www.frankfurter-sparkasse.de/de/home.html,frankfurter-sparkasse.de,https://www.frankfurter-sparkasse.de/de/home/p...
4,5,Stadtsparkasse München,München,SVB,19111868,15680349,4242820,13912104,2663,106,https://www.sskm.de/de/home.html,sskm.de,https://www.sskm.de/de/home/produkte/konten.ht...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
388,381,Sparkasse Battenberg,Battenberg Eder,SGVHT,218688,174253,83651,158904,50,6,https://www.sparkasse-battenberg.de/de/home.html,sparkasse-battenberg.de,https://www.sparkasse-battenberg.de/de/home/pr...
389,382,Stadtsparkasse Schwalmstadt,Schwalmstadt,SGVHT,218582,175022,43371,131275,47,3,https://www.stadtsparkasse-schwalmstadt.de/de/...,stadtsparkasse-schwalmstadt.de,https://www.stadtsparkasse-schwalmstadt.de/de/...
390,383,Stadtsparkasse Grebenstein,Grebenstein,SGVHT,217372,180422,86820,137989,52,4,https://www.stadtsparkasse-grebenstein.de/de/h...,stadtsparkasse-grebenstein.de,https://www.stadtsparkasse-grebenstein.de/de/h...
391,384,Stadtsparkasse Borken (Hessen),Borken,SGVHT,178546,129399,44868,117875,46,1,https://www.sskborken.de/de/home.html,sskborken.de,https://www.sskborken.de/de/home/privatkunden/...


## 5) Verify table structure and identify patterns

In [197]:
# Load the scraping test results; the columns `requested_url` and
# `tables_on_website` are used below.
# NOTE(review): absolute, machine-specific path; this also rebinds the name
# `df`, which held the Sparkassen table in earlier cells.
df = pd.read_csv('/Users/felixvemmer/Desktop/sparkassen_accounts/scraping_test.csv')

In [199]:
# Requested URLs for which the scraper found no table at all
no_tables_found = df['tables_on_website'] == 0
df.loc[no_tables_found, 'requested_url']

6      https://www.sparkasse-hannover.de/de/home/priv...
103    https://www.ksk-tut.de/de/home/privatkunden/gi...
107    https://www.sparkasse-ffb.de/de/home/privatkun...
120    https://www.sk-westerwald-sieg.de/de/home/priv...
131    https://www.sparkasse-mittelsachsen.de/de/home...
389    https://www.lzo.com/de/home/privatkunden/girok...
Name: requested_url, dtype: object

Unnamed: 0,requested_url,tables_on_website
0,https://www.haspa.de/de/home/privatkunden/giro...,0
1,https://www.sparkasse-koelnbonn.de/de/home/pri...,6
2,https://www.sskm.de/de/home/produkte/konten.ht...,6
3,https://www.frankfurter-sparkasse.de/de/home/p...,9
4,https://www.sparkasse-hannover.de/de/home/priv...,0
...,...,...
388,https://www.sparkasse-geseke.de/de/home/privat...,4
389,https://www.stadtsparkasse-haltern.de/de/home/...,7
390,https://www.stadtsparkasse-burgdorf.de/de/home...,3
391,https://www.ssk-bad-sachsa.de/de/home/privatku...,5
