In [245]:
import pandas as pd
import numpy as np

import selenium
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup

from datetime import datetime
from tqdm import tqdm
import requests
import time
import glob

# Gorilla Chart 수집

In [247]:
# 1년치 데이터 수집
start_date='2023-01-01'
end_date='2024-03-01'
date_list=pd.date_range(start=start_date, end=end_date, freq='M')
date_list

DatetimeIndex(['2023-01-31', '2023-02-28', '2023-03-31', '2023-04-30',
               '2023-05-31', '2023-06-30', '2023-07-31', '2023-08-31',
               '2023-09-30', '2023-10-31', '2023-11-30', '2023-12-31',
               '2024-01-31', '2024-02-29'],
              dtype='datetime64[ns]', freq='M')

## 1. 혜택별 Top10

```
   혜택별 숫자 매칭 
    dict = {'통신+공과금': [23,35],  
            '주유+차량정비': [21,118],  
            '쇼핑': [16],  
            '항공마일리지': [26],  
            '점심+교통': [5,137],  
            '무실적+모든가맹점': [11,12],  
            '구독/스트리밍': [169],  
            '해외직구': [27,151],  
            '배달앱+간편결제': [1,139],  
            '편의점+카페': [22,77,127],  
            '마트+교육비': [4,9,48,75],  
            '여행+바우처': [18,158],  
            '제휴/PLCC': [168],  
            '증권사CMA': [175]
            }
```

In [269]:
# 크롬창 띄우기
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--incognito') # 시크릿 모드로 열기
drv = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options = chrome_options)

In [305]:
rank = [i for i in range(1, 11)]

for date in date_list:
    get_date = date.strftime('%Y-%m-%d')
    # sub={혜택별 dict값 변경}
    drv.get(f'https://card-gorilla.com/chart/benefit?term=monthly&date={get_date}&sub=175')
    time.sleep(2.5)
    
    soup = BeautifulSoup(drv.page_source, 'html.parser')
    ranks = soup.find_all(class_='rk_lst')
    
    name_list = []
    company_list = []
    href_list = []
    
    for i in range(len(ranks)):
        name_list.append(ranks[i].find(class_='card_name').text.strip())
        company_list.append(ranks[i].find(class_='corp_name').text)
        try:
            href_list.append('https://card-gorilla.com'+ranks[i].find('a')['href'])
        except:
            href_list.append('None')
            
    d_list = [get_date for i in range(10)]
    
    df = pd.DataFrame({'date':d_list, 'rank':rank, 'card_name':name_list, 
                       'card_company':company_list, 'card_href':href_list})
    
    if get_date == '2023-01-31':
        concat_df = df.copy()
    else:
        concat_df = pd.concat([concat_df, df], ignore_index=True)

drv.close()

In [308]:
concat_df

Unnamed: 0,date,rank,card_name,card_company,card_href
0,2023-01-31,1,부자되세요 더마일리지 체크카드,교보증권,https://card-gorilla.com/card/detail/1903
1,2023-01-31,2,able Premier Members 카드,KB증권,https://card-gorilla.com/card/detail/2225
2,2023-01-31,3,Win.K 체크카드,교보증권,https://card-gorilla.com/card/detail/1900
3,2023-01-31,4,able 카드 Ⅱ,KB증권,https://card-gorilla.com/card/detail/2219
4,2023-01-31,5,DB금융투자 해피플러스 체크카드,DB금융투자,https://card-gorilla.com/card/detail/2220
...,...,...,...,...,...
135,2024-02-29,6,미래에셋증권 체크카드 (캐시백 2),미래에셋증권,https://card-gorilla.com/card/detail/2217
136,2024-02-29,7,Win.K 체크카드,교보증권,https://card-gorilla.com/card/detail/1900
137,2024-02-29,8,한국투자 the More 체크카드,한국투자증권,https://card-gorilla.com/card/detail/2222
138,2024-02-29,9,able 아이맥스카드,KB증권,https://card-gorilla.com/card/detail/2223


In [309]:
# save
concat_df['promotion_cat'] = ['증권사CMA' for i in range(len(concat_df))]
concat_df.to_csv('project_data/증권사CMA_top10.csv', index=False, encoding='euc-kr')

## 2-1. 카드사별 Top10

```
    회사별 숫자 매칭
    dict = {'KB국민카드':3,
            '삼성카드':1,
            '롯데카드':4,
            'BC바로카드':32,
            '신한카드':2,
            '현대카드':7,
            '우리카드':5,
            'NH농협카드':9,
            '하나카드':8,
            'IBK기업은행':10
            }
```

In [158]:
# 크롬창 띄우기
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--incognito') # 시크릿 모드로 열기
drv = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options = chrome_options)

In [201]:
rank = [i for i in range(1, 11)]

for date in date_list:
    get_date = date.strftime('%Y-%m-%d')
    # sub={회사별 dict값 변경}
    drv.get(f'https://card-gorilla.com/chart/corp?term=monthly&date={get_date}&sub=10')
    time.sleep(5)
    
    soup = BeautifulSoup(drv.page_source, 'html.parser')
    ranks = soup.find_all(class_='rk_lst')
    
    name_list = []
    company_list = []
    href_list = []
    
    for i in range(len(ranks)):
        name_list.append(ranks[i].find(class_='card_name').text.strip())
        company_list.append(ranks[i].find(class_='corp_name').text)
        try:
            href_list.append('https://card-gorilla.com'+ranks[i].find('a')['href'])
        except:
            href_list.append('None')
            
    d_list = [get_date for i in range(10)]
    
    df = pd.DataFrame({'date':d_list, 'rank':rank, 'card_name':name_list, 
                       'card_company':company_list, 'card_href':href_list})
    
    if get_date == '2023-01-31':
        concat_df = df.copy()
    else:
        concat_df = pd.concat([concat_df, df], ignore_index=True)
        
drv.close()

In [202]:
concat_df

Unnamed: 0,date,rank,card_name,card_company,card_href
0,2023-01-31,1,마일앤조이카드(대한항공),IBK기업은행,https://card-gorilla.com/card/detail/267
1,2023-01-31,2,BLISS.7 카드(마일리지),IBK기업은행,https://card-gorilla.com/card/detail/671
2,2023-01-31,3,ONE AIR(UniMile),IBK기업은행,https://card-gorilla.com/card/detail/475
3,2023-01-31,4,일상의 기쁨카드(신용),IBK기업은행,https://card-gorilla.com/card/detail/266
4,2023-01-31,5,마일앤조이카드(아시아나),IBK기업은행,https://card-gorilla.com/card/detail/268
...,...,...,...,...,...
135,2024-02-29,6,K-22(Mileage),IBK기업은행,https://card-gorilla.com/card/detail/2536
136,2024-02-29,7,마일앤조이카드(아시아나),IBK기업은행,https://card-gorilla.com/card/detail/268
137,2024-02-29,8,I-알뜰교통플러스 카드(신용),IBK기업은행,https://card-gorilla.com/card/detail/2559
138,2024-02-29,9,Daily With(데일리위드)카드,IBK기업은행,https://card-gorilla.com/card/detail/655


In [203]:
# save
concat_df.to_csv('project_data/IBK기업은행_top10.csv', index=False, encoding='euc-kr')

## 2-2. 카드별 혜택 정보 수집

In [169]:
df_list = glob.glob('project_data/*_top10.csv')
df_list

['project_data\\BC바로카드_top10.csv',
 'project_data\\IBK기업은행_top10.csv',
 'project_data\\KB국민카드_top10.csv',
 'project_data\\NH농협카드_top10.csv',
 'project_data\\롯데카드_top10.csv',
 'project_data\\삼성카드_top10.csv',
 'project_data\\신한카드_top10.csv',
 'project_data\\우리카드_top10.csv',
 'project_data\\하나카드_top10.csv',
 'project_data\\현대카드_top10.csv']

In [233]:
df = pd.read_csv(df_list[9], encoding='euc-kr')
df.drop_duplicates('card_href', inplace=True, ignore_index=True)
df = df[df['card_href'].isnull()==False]
df.reset_index(inplace=True, drop=True)
df

Unnamed: 0,date,rank,card_name,card_company,card_href
0,2023-01-31,1,현대카드ZERO Edition2(할인형),현대카드,https://card-gorilla.com/card/detail/608
1,2023-01-31,2,현대카드 M BOOST,현대카드,https://card-gorilla.com/card/detail/700
2,2023-01-31,3,네이버 현대카드,현대카드,https://card-gorilla.com/card/detail/2233
3,2023-01-31,4,American Express® Gold Card,현대카드,https://card-gorilla.com/card/detail/2281
4,2023-01-31,5,대한항공카드 030,현대카드,https://card-gorilla.com/card/detail/600
5,2023-01-31,6,the Green Edition2,현대카드,https://card-gorilla.com/card/detail/2260
6,2023-01-31,7,the Red Edition5,현대카드,https://card-gorilla.com/card/detail/2259
7,2023-01-31,8,현대카드Z family,현대카드,https://card-gorilla.com/card/detail/730
8,2023-01-31,9,현대카드ZERO Edition2(포인트형),현대카드,https://card-gorilla.com/card/detail/610
9,2023-01-31,10,현대카드 X BOOST,현대카드,https://card-gorilla.com/card/detail/701


In [217]:
# 크롬창 띄우기
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--incognito') # 시크릿 모드로 열기
drv = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options = chrome_options)

In [234]:
for index, href in tqdm(enumerate(df['card_href'])):
    drv.get(href)
    time.sleep(3.5)
    soup = BeautifulSoup(drv.page_source, 'html.parser')
    
    # event
    try:
        event = soup.find(class_='event_txt').text
    except:
        event = 'None'
    
    # promotion
    promotions_list = soup.find(class_='bnf1').find_all('dl')
    promotions = ''
    for i in range(len(promotions_list)):
        prm = promotions_list[i].find('dt').text + ' ' + promotions_list[i].find('dd').text
        prm.replace('최대', '').replace(',', '').strip().strip()
        promotions += prm + ' && '
    promotions = promotions[:-3].strip()
    
    # 연회비, 실적, 카드종류
    bottoms = soup.find(class_='bnf2').find_all('dl')
    
    fee = bottoms[0].find(class_='in_out').text.replace(',', '')
    
    perform = bottoms[1].text.replace('\n', '').replace('\t', '')
    
    brand_list = bottoms[2].find(class_='c_brand').find_all('span')
    brand = ''
    for text in brand_list:
        brand += text.text
        brand += ' '
    brand = brand.strip()
    
    dd = pd.DataFrame({'card_company': [df['card_company'][index]], 'card_name': [df['card_name'][index]], 'event': [event], 
                       'promotions': [promotions], 'fee': [fee], 'perform': [perform], 'brand': [brand]})
    
    if index==0:
        concat_df = dd
    else:
        concat_df = pd.concat([concat_df, dd], ignore_index=True)
    
    time.sleep(2)
    
drv.close()

12it [01:17,  6.42s/it]


In [235]:
concat_df

Unnamed: 0,card_company,card_name,event,promotions,fee,perform,brand
0,현대카드,현대카드ZERO Edition2(할인형),,모든가맹점 0.7%할인 && 생활필수영역 1.5%할인,국내전용 10000원 해외겸용 10000원,전월실적없음,VISA
1,현대카드,현대카드 M BOOST,신규회원 연회비 캐시백 이벤트,업종별 0.5~3%기본적립 && 온라인 간편결제 5%적립 && 해외가맹점 5%적립,국내전용 30000원 해외겸용30000원,전월실적50만원 이상,VISA
2,현대카드,네이버 현대카드,,네이버플러스멤버십 무료이용권 && 네이버플러스멤버십 회원 5%적립 && 그외가맹점 ...,국내전용 10000원 해외겸용 10000원,전월실적30만원 이상,VISA
3,현대카드,American Express® Gold Card,,매년 연간스페셜 3만MR적립(30만원 상당) && 공항라운지/발레파킹 무료이용 &&...,해외겸용 300000원,전월실적없음,AMEX
4,현대카드,대한항공카드 030,,국내외가맹점 1마일리지적립 && 특정업종 2마일리지적립 && 웰컴보너스 3천마일리지적립,국내전용 30000원 해외겸용 30000원,전월실적없음,mastercard
5,현대카드,the Green Edition2,신규회원 Welcome 이벤트,바우처 10만원제공 && 공항라운지 무료이용 && M포인트 최대 2%적립,국내전용 150000원 해외겸용 150000원,전월실적50만원 이상,VISA
6,현대카드,the Red Edition5,,바우처 최대 20만원제공 && 공항라운지 무료이용 && M포인트 최대 2%적립,국내전용 300000원 해외겸용 300000원,전월실적50만원 이상,VISA
7,현대카드,현대카드Z family,,온라인쇼핑 10%할인 && 배달앱 10%할인 && 주유소 100원/L할인,국내전용 10000원 해외겸용 10000원,전월실적40만원 이상,VISA
8,현대카드,현대카드ZERO Edition2(포인트형),,국내외 가맹점 1%기본적립 && 생활필수영역 2.5%적립,국내전용 10000원 해외겸용 10000원,전월실적없음,VISA
9,현대카드,현대카드 X BOOST,,모든가맹점 1~1.5%기본할인 && 온라인 간편결제 5%할인 && 해외가맹점 5%할인,국내전용 30000원 해외겸용30000원,전월실적50만원 이상,VISA


In [236]:
# save
concat_df.to_csv('project_data/현대카드_info.csv', encoding='euc=kr', index=False)

## 파일 합치기

In [318]:
# 전체 카드 정보 및 순위 파일 합치기
# df_list = glob.glob('project_data/*_info.csv')
df_list = glob.glob('project_data/*_top10.csv')

for i in range(len(df_list)):
    if i == 0:
        all_df = pd.read_csv(df_list[i], encoding='euc-kr')
    if df_list[i] == 'project_data\\여행_바우처_top10.csv':
        all_df = pd.concat([all_df, pd.read_csv(df_list[i])], ignore_index=True)
    else:
        all_df = pd.concat([all_df, pd.read_csv(df_list[i], encoding='euc-kr')], ignore_index=True)

all_df.to_csv('project_data/card_promotion_top10_all.csv', index=False)