In [30]:
import time 
import requests, bs4
import re
import sys
import os
import datetime
from urllib.parse import urlparse

import pandas as pd
import numpy as np
from omegaconf import OmegaConf

sys.path.append("../")
from src.scraping import ScrapingBase, ScrapingSponavi

%matplotlib inline
# %load_ext autoreload
%reload_ext autoreload
%autoreload 2

# 出力先
conf_dir = "../config"
conf_cli = OmegaConf.from_cli()
conf_exec = OmegaConf.load(os.path.join(conf_dir, "config_exec.yaml"))
conf_path = OmegaConf.load(os.path.join(conf_dir, "config_path.yaml"))
conf_url = OmegaConf.load(os.path.join(conf_dir, "config_url.yaml"))
conf_team = OmegaConf.load(os.path.join(conf_dir, "config_team.yaml"))
config = OmegaConf.merge(conf_cli, conf_exec, conf_path, conf_url, conf_team)

In [60]:
ss = ScrapingSponavi(config=config,start_date="2021-03-21", end_date="2021-03-21")

In [63]:
url = "https://baseball.yahoo.co.jp/npb/game/2021000092/top"

In [64]:
html = ss.get_html(url)

get html  https://baseball.yahoo.co.jp/npb/game/2021000092/top
success


In [93]:
soup = bs4.BeautifulSoup(html, "html.parser")

# 結果を格納する辞書
result = {}

# ゲームID
result["game_id"] = "npb" + url.split("/")[-2]

# 日付
game_date = soup.select_one("title").get_text().split(" ")[0]
result["game_date"] = datetime.datetime.strptime(game_date, "%Y年%m月%d日").strftime("%Y-%m-%d")

# ステータス
state_original = soup.select_one('span.bb-gameCard__state').get_text(strip=True)
if state_original=="試合終了":
    result["status"] = "finish"
elif state_original == "試合中止":
    result["status"] = "cancel"
elif state_original == "試合前":
    result["status"] = "before"
else:
    result["status"] = "unkown"

# 球場
description = soup.select_one("p[class='bb-gameDescription']")
result["team_top_name"] = description.get_text().split("\n")[3].replace(" ","")

# 試合開始時間
description = soup.select_one("p[class='bb-gameDescription']")
result["start_time"] = description.get_text().split("\n")[2].replace(" ","")

# チーム名
teams = [x.get_text(strip=True) for x in soup.select("a.bb-gameScoreTable__team")]
result["team_top_name"] = teams[0]
result["team_bottom_name"] = teams[1]
result["team_top_id"] = config.team[teams[0]].team_id
result["team_bottom_id"] = config.team[teams[1]].team_id

# 合計点
scores = [x.get_text(strip=True) for x in soup.select("td[class='bb-gameScoreTable__total']")]
result["score_top"] = scores[0]
result["score_bottom"] = scores[1]

# 安打数
hits = [x.get_text(strip=True) for x in soup.select("td[class='bb-gameScoreTable__total bb-gameScoreTable__data--hits']")]
result["hit_top"] = hits[0]
result["hit_bottom"] = hits[1]

# 失策数
hits = [x.get_text(strip=True) for x in soup.select("td[class='bb-gameScoreTable__total bb-gameScoreTable__data--loss']")]
result["error_top"] = hits[0]
result["error_bottom"] = hits[1]

# 責任投手
soup_pick = soup.select_one("section[id='pit_rec']")
pitchers = [x for x in soup_pick.select("td.bb-gameTable__data")]
list_pitcher_ids = []
for pitcher in pitchers:
    if pitcher.get_text(strip=True) != "":
        pitcher_url = pitcher.select_one("a[class='bb-gameTable__player']")
        list_pitcher_ids.append(ss.get_id(pitcher_url.get("href").split("/")[-2]))
    else:
        list_pitcher_ids.append(np.nan)
result["picher_win_id"] = list_pitcher_ids[0]
result["picher_lose_id"] = list_pitcher_ids[1]
result["picher_save_id"] = list_pitcher_ids[2]

# 審判
soup_pick = soup.select("section[class='bb-modCommon01']")[-2]
data = [x.get_text(strip=True) for x in soup_pick.select("td.bb-tableLeft__data")]
result["umpire_plate"] = data[0]
result["umpire_first"] = data[1]
result["umpire_second"] = data[2]
result["umpire_third"] = data[3]

# 観客数/試合時間
soup_pick = soup.select("section[class='bb-modCommon01']")[-1]
data = [x.get_text(strip=True) for x in soup_pick.select("td.bb-tableLeft__data")]
result["audience"] = data[0]
result["game_time"] = data[1]


result

{'game_id': 'npb2021000092',
 'game_date': '2021-03-21',
 'status': 'finish',
 'team_top_name': '日本ハム',
 'start_time': '13:00',
 'team_bottom_name': '中日',
 'team_top_id': 'npb8',
 'team_bottom_id': 'npb4',
 'score_top': '2',
 'score_bottom': '9',
 'hit_top': '4',
 'hit_bottom': '16',
 'error_top': '2',
 'error_bottom': '1',
 'picher_win_id': 'npb1500074',
 'picher_lose_id': 'npb1700029',
 'picher_save_id': nan,
 'umpire_plate': '山本貴',
 'umpire_first': '津川',
 'umpire_second': '川口',
 'umpire_third': '原',
 'audience': '9,946人',
 'game_time': '3時間11分'}

In [None]:
url_schedule_day = 'https://baseball.yahoo.co.jp/npb/schedule/

In [20]:
import re

content = "https://baseball.yahoo.co.jp/npb/game/2021000011/index"
result = re.search("\d+", content)
if result:
    print(result.group())

2021000011


In [8]:
print(url)
index = re.search(r'index=\d+', url)
res = requests.get(url)

try:
    res.raise_for_status() 
except requests.exceptions.RequestException:
    print('Not Found Error in {}'.format(index.group())) #Not Foundエラーが出たらその回は終わったとみなす
    #break

soup = bs4.BeautifulSoup(res.text, "html.parser")
elems = soup.select('.bb-calendarTable__status')
elems2 = soup.select('.bb-calendarTable__venue')

#print(elems)
for elem,elem2 in zip(elems,elems2):
    url = elem.get('href')
    date = url[38:46]
    venue = elem2.text
    #print(venue)
    game_yokohama.loc[len(game_yokohama)] = [url,date,venue]
time.sleep(1)

NameError: name 'url' is not defined