# NerdWallet - Web scraping

In [39]:
%pip install -r requirements.txt

You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.


In [3]:
import requests, time
from bs4 import BeautifulSoup
from urllib.error import HTTPError
import pandas as pd

In [4]:
# Root of the NerdWallet "best credit cards" section; a category slug is
# appended to this to form each page URL.
base_url = "https://www.nerdwallet.com/the-best-credit-cards"

In [5]:
def get_cards_info(response, t):
    """Parse one category page into a DataFrame with one row per card.

    Args:
        response: An HTTP response object whose ``.text`` attribute holds the
            page HTML.
        t: Category label stored in the ``'category'`` column of every row.

    Returns:
        pandas.DataFrame: The card metadata columns (``'card name'``,
        ``'ref link'``, ``'category'``, ``'rank in category'``) joined
        column-wise with one column per detail label found in the card
        detail boxes.

    Raises:
        KeyError: If a matched card link has no ``href`` attribute.
    """
    # NOTE(review): the CSS class strings below look auto-generated by the
    # site's styling framework; if the site is redeployed with new class
    # names these selectors will presumably match nothing -- confirm.
    page = BeautifulSoup(response.text, 'html.parser')
    card_links = page.find_all('a', {'class': 'MuiTypography-root MuiTypography-headline MuiLink-root MuiLink-underlineHover css-7wcx0j'})
    card_boxes = page.find_all('div', {'class': 'MuiBox-root css-q5fqw0'})

    # Rank is the 1-based position of the card link in document order.
    cards_meta_ls = [
        {
            'card name': link.text,
            'ref link': link['href'],
            'category': t,
            'rank in category': rank,
        }
        for rank, link in enumerate(card_links, start=1)
    ]

    cards_ls = []
    for box in card_boxes:
        # Start from an empty dict for every card: reusing one dict across
        # cards would let a field that is missing on this card silently keep
        # the previous card's value.
        card_info = {}
        info_key = box.find_all('p', {'class': 'MuiTypography-root MuiTypography-body1 css-19gg6ql'})
        info_value = box.find_all('div', {'class': 'MuiBox-root css-osq69c'})
        for key, value in zip(info_key, info_value):
            detail = value.find('span', {'class': 'MuiBox-root css-1baulvz'})
            # The extra detail text is read from the aria-label of the span
            # nested in the detail span; tolerate either being absent rather
            # than aborting the whole page.
            label = None
            if detail is not None and detail.span is not None:
                label = detail.span.get('aria-label')
            if label:
                card_info[key.text] = value.text + "(" + label + ")"
            else:
                card_info[key.text] = value.text
        cards_ls.append(card_info)

    cards_meta_pd = pd.DataFrame(cards_meta_ls)
    cards_pd = pd.DataFrame(cards_ls)
    # Column-wise join aligned on the default positional index, i.e. the n-th
    # card link is paired with the n-th detail box. If the two selectors
    # return different counts, the shorter side is padded with NaN.
    result_pd = pd.concat([cards_meta_pd, cards_pd], axis=1)
    return result_pd

In [6]:
# Category slugs appended to base_url; each one is fetched and parsed in turn.
tabs = ["monthly-best", "travel", "balance-transfer", "low-interest", "cash-back", "rewards", "building-credit", "student", "fair-credit", "business"]

# Collect one frame per category and concatenate once at the end instead of
# re-copying the accumulated frame on every iteration.
category_frames = []
for t in tabs:
    url = f"{base_url}/{t}"
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as err:
        # raise_for_status() raises requests' own HTTPError (a
        # RequestException subclass), not urllib.error.HTTPError. Report the
        # failure and skip this category so a failed or stale response is
        # never parsed.
        print(f"{url}: {err}")
        continue
    category_frames.append(get_cards_info(response, t))

# pd.concat raises on an empty list, so fall back to an empty frame when
# every request failed.
result = pd.concat(category_frames, ignore_index=True) if category_frames else pd.DataFrame({})

In [7]:
# Write the combined table as line-delimited JSON: one record (row) per line.
result.to_json(
    path_or_buf="NerdWallet_creditcard.json",
    orient="records",
    lines=True,
)