In [17]:
import pandas as pd
import numpy as np
import requests 
from bs4 import BeautifulSoup
from tqdm import tqdm
import os
from selenium import webdriver
from IPython.display import clear_output
import time

In [18]:
# Championship label; parse_match writes it into the first column ('чемпионат') of every row.
championship = 'ЮФЛ'
# Column layout shared by every frame in this notebook. The row lists assembled in
# parse_match are positional, so they must follow exactly this order (37 columns).
COLUMNS = ['чемпионат', 'сезон', 'турнир', 'тур', 'месяц матча', 'матч', 'хозяева', 'хозяева голы',
           'гости голы', 'гости', 'гости\\хозяева', 'результат', 'id_матча',
           'id_результата', 'амплуа', 'фио', 'id_фио', 'имя', 'фамилия', 'ссылка игрока',
           'номер игрока', 'дата рождения', 'месяц рождения', 'старт\\запас',
           'минута выхода', 'минута ухода', 'минут на поле', 'гол', 'ассист',
           'жк', 'кк', 'пенальти', 'автогол', 'команда', 'id_команды', 'CategoryColumn', 'судья']
# Empty template frame; the parsers call df_empty.copy() so results always share these columns.
df_empty = pd.DataFrame(columns=COLUMNS)

In [19]:
def monthchanger(month):
    """Turn a Russian month name in the genitive case into the nominative case.

    Args:
        month: capitalised genitive month name, e.g. 'Января'.

    Returns:
        The nominative form (e.g. 'Январь'); any value that is not one of the
        twelve known genitive names is returned unchanged.
    """
    nominative_by_genitive = {
        'Января': 'Январь',
        'Февраля': 'Февраль',
        'Марта': 'Март',
        'Апреля': 'Апрель',
        'Мая': 'Май',
        'Июня': 'Июнь',
        'Июля': 'Июль',
        'Августа': 'Август',
        'Сентября': 'Сентябрь',
        'Октября': 'Октябрь',
        'Ноября': 'Ноябрь',
        'Декабря': 'Декабрь',
    }
    return nominative_by_genitive.get(month, month)

def monthfromdigit(month):
    """Map a two-digit month number string to the Russian month name.

    Args:
        month: zero-padded month number as text, '01' through '12'.

    Returns:
        The nominative Russian month name; any other value (including
        non-padded numbers such as '1') is returned unchanged.
    """
    names_in_order = ('Январь', 'Февраль', 'Март', 'Апрель', 'Май', 'Июнь',
                      'Июль', 'Август', 'Сентябрь', 'Октябрь', 'Ноябрь', 'Декабрь')
    # Keys are '01' .. '12', matching the zero-padded form the callers slice out of dates.
    name_by_number = {'%02d' % position: name
                      for position, name in enumerate(names_in_order, start=1)}
    return name_by_number.get(month, month)

def month_from_datetime(datetime):
    """Extract the nominative month name from a match date/time caption.

    Args:
        datetime: caption text; only the part before the first '/' is used and
            its second whitespace-separated token is taken as the month.
            (The parameter name shadows the stdlib module name inside this function.)

    Returns:
        The month token passed through monthchanger.

    Raises:
        IndexError: if the part before '/' has fewer than two tokens.
    """
    date_part = datetime.partition('/')[0]
    tokens = date_part.split()
    return monthchanger(tokens[1])

def reversename(name):
    """Move the last word of a name to the front.

    'Иванов Иван' becomes 'Иван Иванов'. Runs of whitespace are collapsed to
    single spaces. A one-word name is returned as is (previously it came back
    with a trailing space, which broke later lookups against stripped names).

    Args:
        name: name text with at least one word.

    Returns:
        The reordered name.

    Raises:
        IndexError: if ``name`` contains no words; parse_team relies on this
            to skip rows without a usable name.
    """
    parts = name.split()
    return ' '.join([parts[-1]] + parts[:-1])

def fullrole(role):
    """Expand an abbreviated playing position to its full Russian name.

    Args:
        role: one of 'Вр.', 'Зщ.', 'Пз.', 'Нп.'.

    Returns:
        The full position name, or ``np.nan`` for any other value.
    """
    full_by_short = {
        'Вр.': 'Вратарь',
        'Зщ.': 'Защитник',
        'Пз.': 'Полузащитник',
        'Нп.': 'Нападающий',
    }
    return full_by_short.get(role, np.nan)

def update_teamsdict(team, teams_dict):
    """Return the id of ``team``, registering it in ``teams_dict`` if it is new.

    A new team receives ``len(teams_dict) + 1`` as its id; the dict is
    modified in place.
    """
    if team not in teams_dict:
        teams_dict[team] = len(teams_dict) + 1
    return teams_dict[team]

def update_playersdict(player, players_dict):
    """Look up the id of ``player``; unseen names are added to ``players_dict``.

    The id given to a new name is one more than the number of entries the
    dict held before the insertion. The dict is updated in place.
    """
    next_id = len(players_dict) + 1
    return players_dict.setdefault(player, next_id)

In [20]:
def _birth_entry(raw_date):
    """Build the [date, month name] pair stored per person by parse_team.

    The fixed slices assume the text starts with DD, MM and YYYY separated by
    single characters (e.g. '05.03.2004') — TODO confirm against the site.
    """
    day, month, year = raw_date[:2], raw_date[3:5], raw_date[6:10]
    return [day + '.' + month + '.' + year, monthfromdigit(month)]


def parse_team(team_link):
    """Download a team page and collect the birth dates listed on it.

    Args:
        team_link: absolute URL of the team page.

    Returns:
        dict mapping a person's name to ``['DD.MM.YYYY', month name]``.
        Player names are reordered with reversename; staff names are
        'first-name last-name' as taken from the composition list.
        People without a readable name or birth date are left out.

    Raises:
        ConnectionError: propagated from request_page when the page cannot be fetched.
    """
    team_page = request_page(team_link)
    team_html = BeautifulSoup(team_page.text, 'html.parser')
    bd_dict = {}

    for player_ling in team_html('tr', class_='table__row'):
        try:
            name = reversename(player_ling.find('p', class_='table__player-name').string.strip())
        except (AttributeError, IndexError):
            # Rows without a player-name paragraph (or with an empty one) are skipped.
            continue
        date_cell = player_ling.find('td', class_='table__cell table__cell--middle mobile-hide')
        if date_cell is None or date_cell.string is None:
            # No birth-date cell in this row: nothing to record.
            continue
        bd_date = date_cell.string.strip()
        if bd_date == '':
            continue
        bd_dict[name] = _birth_entry(bd_date)

    staff_lings = team_html('li', class_='composition-list__item composition-list__item-flip-container with-stats')
    for staff_ling in staff_lings:
        firstname = staff_ling.find('span', class_='composition-list__player-first-name').string.strip()
        secondname = staff_ling.find('span', class_='composition-list__player-last-name').string.strip()
        name = firstname + ' ' + secondname
        try:
            bd_date = staff_ling.find('span', class_='composition-list__player-birth-date').string.strip()
        except AttributeError:
            # Birth date is optional for staff entries.
            continue
        bd_dict[name] = _birth_entry(bd_date)

    return bd_dict

def get_result(score1, score2):
    """Describe the outcome of a match for both sides.

    Args:
        score1: goals of the first team.
        score2: goals of the second team.

    Returns:
        Tuple ``(result1, result2)`` of Russian labels: 'Победа' for the
        winner, 'Поражение' for the loser, 'Ничья' for both when neither
        score is greater.
    """
    win, loss, draw = 'Победа', 'Поражение', 'Ничья'
    if score1 > score2:
        return win, loss
    if score2 > score1:
        return loss, win
    return draw, draw

def get_playerinfo(player_ling, players_dict):
    """Read a player's identity fields from one match-protocol list item.

    Args:
        player_ling: BeautifulSoup tag of the player entry; its first ``<a>``
            carries the full name and the profile href.
        players_dict: name -> id registry, updated in place via update_playersdict.

    Returns:
        Tuple ``(role, fullname, id_player, firstname, secondname, player_link, number)``.
        ``role`` and ``number`` are ``np.nan`` when the corresponding span is
        missing or unreadable. ``player_link`` is the href exactly as found on the page.
    """
    fullname = player_ling.a.string.strip()
    id_player = update_playersdict(fullname, players_dict)
    name_parts = fullname.split()
    firstname = name_parts[0]
    secondname = ' '.join(name_parts[1:])

    player_link = player_ling.a.get('href')

    try:
        role = fullrole(player_ling.find('span', class_='match-protocol__member-amplua').string.strip())
    except AttributeError:
        # Span absent, or it has no single text child.
        role = np.nan

    try:
        number = int(player_ling.find('span', class_='match-protocol__member-number').string.strip())
    except (AttributeError, ValueError):
        # Span absent or its text is not an integer.
        number = np.nan

    return role, fullname, id_player, firstname, secondname, player_link, number

def _substitution_minute(player_ling, css_class):
    """Return the minute of the player's substitution event, or None if there is none.

    The event text is expected to be digits followed by one trailing
    character, which is dropped; texts of length 0 or 1 count as "no event".
    """
    event = player_ling.find('li', class_=css_class)
    if event is None or event.string is None:
        return None
    text = event.string.strip()
    if len(text) <= 1:
        return None
    return int(text[:-1])


def get_playertime(player_ling, main, match_length=90):
    """Work out when a player entered and left the pitch.

    Args:
        player_ling: tag of the player entry in the match protocol.
        main: 'Старт' for the starting line-up or 'Запас' for the bench.
        match_length: minute used as the leaving time when no substitution-out
            event is present (default 90, the value that used to be hard-coded).

    Returns:
        Tuple ``(starttime, endtime, playtime)``. Starters begin at minute 0.
        A bench player without a substitution-in event gets ``np.nan`` in all
        three positions.

    Raises:
        ValueError: if ``main`` is neither 'Старт' nor 'Запас' (previously
            this surfaced as an UnboundLocalError).
    """
    sub_in_class = 'event-item event-item-subs event-item-subs--in'
    sub_out_class = 'event-item event-item-subs event-item-subs--out'

    if main == 'Старт':
        starttime = 0
    elif main == 'Запас':
        starttime = _substitution_minute(player_ling, sub_in_class)
        if starttime is None:
            # Stayed on the bench for the whole match.
            return np.nan, np.nan, np.nan
    else:
        raise ValueError("main must be 'Старт' or 'Запас', got %r" % (main,))

    endtime = _substitution_minute(player_ling, sub_out_class)
    if endtime is None:
        endtime = match_length
    return starttime, endtime, endtime - starttime

def get_playerevents(player_ling):
    """Count the match events shown as icons next to a player.

    Every ``<svg>`` inside the entry is one event kind, identified by its
    first CSS class. If the element two siblings after the icon's parent
    carries the 'event-item--number' class, its text (minus the first
    character) is used as the count; otherwise the count is 1.

    Args:
        player_ling: callable tag of the player entry (``player_ling('svg')``
            must return the icon tags).

    Returns:
        Tuple ``(goals, assists, yc, rc, penalties, autogoals)``.
    """
    counts = {'goal': 0, 'assist': 0, 'penalty': 0,
              'yellow-card': 0, 'red-card': 0, 'autogoal': 0}

    for event in player_ling('svg'):
        kind = event.get('class')[0]
        try:
            counter = event.parent.next_sibling.next_sibling
            if 'event-item--number' in counter.get('class'):
                n = int(counter.string.strip()[1:])
            else:
                n = 1
        except (AttributeError, TypeError, ValueError):
            # No counter element next to the icon, or its text is not a number.
            n = 1
        if kind in counts:
            # Assignment, not accumulation: a repeated icon of the same kind overwrites.
            counts[kind] = n

    return (counts['goal'], counts['assist'], counts['yellow-card'],
            counts['red-card'], counts['penalty'], counts['autogoal'])

def get_stafferinfo(staffer_ling, players_dict):
    """Read a staff member's identity fields from one match-protocol list item.

    With three or more words the second-to-last word becomes the first name,
    everything before it the surname, and the last word is dropped
    (presumably a 'Surname Name Patronymic' layout — TODO confirm).
    Shorter names used to produce a trailing-space full name or an
    IndexError; they are now handled explicitly.

    Args:
        staffer_ling: tag of the staff entry.
        players_dict: name -> id registry, updated in place via update_playersdict.

    Returns:
        Tuple ``(role, fullname, id_staffer, firstname, secondname, staffer_link)``
        where ``fullname`` is 'firstname secondname' and ``staffer_link`` is
        the href exactly as found on the page.
    """
    raw_name = staffer_ling.find('p', class_='match-protocol__staff-name').string.strip()
    parts = raw_name.split()
    if len(parts) >= 3:
        firstname = parts[-2]
        secondname = ' '.join(parts[:-2])
    elif len(parts) == 2:
        # NOTE(review): assumes 'Surname Name' order when there is no third word — confirm.
        secondname, firstname = parts
    else:
        firstname = raw_name
        secondname = ''
    fullname = (firstname + ' ' + secondname).strip()
    id_staffer = update_playersdict(fullname, players_dict)

    staffer_link = staffer_ling.a.get('href')
    role = staffer_ling.find('p', class_='match-protocol__staff-position').string.strip()

    return role, fullname, id_staffer, firstname, secondname, staffer_link

In [31]:
def parse_match(match_link, season=None, id_match=1, players_dict=None, teams_dict=None):
    """Parse one match page into a frame with one row per listed person.

    Rows are produced in this order: home starters, away starters, home
    bench, away bench, home staff, away staff. For a technical result
    (the additional score block reads 'ТП') only two placeholder rows, one
    per team, are returned.

    After the rows are built, penalties are added to each player's goals and
    every own goal is credited to a player of the opposite team (first
    forward, else first midfielder, else first defender, else the second
    listed person) so that per-team goal sums include own goals.

    Uses the module-level names ``championship``, ``df_empty`` and ``yfl``.

    Args:
        match_link: absolute URL of the match page.
        season: season label; when falsy it is derived from the 3rd and 7th
            words of the tournament dates line on the page.
        id_match: id stored in 'id_матча'; 'id_результата' is ``2*id_match - 1``
            for the home side and ``2*id_match`` for the away side.
        players_dict: name -> id registry, updated in place. A new dict is
            created per call when omitted (the former ``{}`` default was
            shared between calls).
        teams_dict: team -> id registry, same conventions as ``players_dict``.

    Returns:
        DataFrame with the columns of ``df_empty``.
    """
    if players_dict is None:
        players_dict = {}
    if teams_dict is None:
        teams_dict = {}

    match_page = request_page(match_link)
    match_html = BeautifulSoup(match_page.text, 'html.parser')

    match_link_short = match_html.find('a', class_='match-promo__score').get('href')

    df = df_empty.copy()

    tournament = match_html.find('h2', class_='tournament__title').string.strip()
    season_fullname = match_html.find('li', class_='tournament__info-item tournament__info-dates').string.strip()
    season = season or (season_fullname.split()[2] + '-' + season_fullname.split()[6])

    match_datetime = match_html.find('div', class_='match-promo__date-time').string.strip()
    month = month_from_datetime(match_datetime)

    team1_ling = match_html.find('div', class_='match-promo__team-container match-promo__team-container--left')
    team1 = team1_ling.find('a', class_='match-promo__team-name').string.strip()
    team2_ling = match_html.find('div', class_='match-promo__team-container match-promo__team-container--right')
    team2 = team2_ling.find('a', class_='match-promo__team-name').string.strip()
    score = match_html.find('div', class_='match-promo__score-main').string.strip()
    match = team1 + ' ' + score + ' ' + team2

    id_team1 = update_teamsdict(team1, teams_dict)
    id_team2 = update_teamsdict(team2, teams_dict)

    score_tokens = score.split()
    score1 = int(score_tokens[0])
    score2 = int(score_tokens[-1])
    result1, result2 = get_result(score1, score2)

    tour_ling = match_html.find('a', class_='match-promo__tour')
    if tour_ling:
        # Drops the last four characters of the caption (presumably a ' тур' suffix — TODO confirm).
        tour = tour_ling.string.strip()[:-4]
    else:
        tour = 1

    referee_ling = match_html.find('div', class_='match-protocol__referee')
    if referee_ling:
        referee_spans = referee_ling('span')
        referee = referee_spans[0].string.strip() + ' ' + referee_spans[1].string.strip()
    else:
        referee = np.nan

    # Pieces shared by every row; together with the 19 person fields they
    # must line up with the 37 entries of COLUMNS.
    match_info = [championship, season, tournament, tour, month, match, team1, score1, score2, team2]
    home = ('Хозяева', result1, 2 * id_match - 1, team1, id_team1)
    away = ('Гости', result2, 2 * id_match, team2, id_team2)

    def build_row(side, person_info):
        """Assemble one full row from the side description and 19 person fields."""
        label, result, id_result, team, id_team = side
        return (match_info + [label, result, id_match, id_result] + person_info
                + [team, id_team, match_link_short, referee])

    additional = match_html.find('div', class_='match-promo__score-additional')
    if additional is not None and (additional.string or '').strip() == 'ТП':
        forfeit_info = ['ТП'] + [np.nan] * 18
        df.loc[0] = build_row(home, forfeit_info)
        df.loc[1] = build_row(away, forfeit_info)
        return df

    bdteam1_dict = parse_team(yfl + team1_ling.a.get('href'))
    bdteam2_dict = parse_team(yfl + team2_ling.a.get('href'))

    # (side description, birth-date lookup, CSS suffix) in home-then-away order.
    sides = ((home, bdteam1_dict, 'left'), (away, bdteam2_dict, 'right'))
    rows = []

    def add_players(composition_ling, main):
        """Append rows for all players of both teams found in one composition block."""
        for side, bd_dict, css_side in sides:
            member_class = 'match-protocol__member match-protocol__member--' + css_side
            for player_ling in composition_ling('li', class_=member_class):
                role, fullname, id_player, firstname, secondname, player_link, number = \
                    get_playerinfo(player_ling, players_dict)
                bd_date, bd_month = bd_dict.get(fullname, (np.nan, np.nan))
                starttime, endtime, playtime = get_playertime(player_ling, main)
                goals, assists, yc, rc, penalties, autogoals = get_playerevents(player_ling)
                rows.append(build_row(side, [
                    role, fullname, id_player, firstname, secondname, player_link, number,
                    bd_date, bd_month, main, starttime, endtime, playtime,
                    goals, assists, yc, rc, penalties, autogoals]))

    # The page holds consecutive composition blocks: starters, bench, then (optionally) staff.
    main_ling = match_html.find('div', class_='match-protocol__composition')
    add_players(main_ling, 'Старт')

    reserve_ling = main_ling.find_next('div', class_='match-protocol__composition')
    add_players(reserve_ling, 'Запас')

    staff_ling = reserve_ling.find_next('div', class_='match-protocol__composition')
    if staff_ling is not None:
        main = 'Штаб'
        placeholder_classes = ('match-protocol-empty', 'event-item', 'match-protocol__staff--empty')
        for side, bd_dict, css_side in sides:
            team_list = staff_ling.find('ul', class_='match-protocol__team match-protocol__team--' + css_side)
            if team_list is None:
                continue
            for staffer_ling in team_list('li'):
                classes = staffer_ling.get('class') or []
                if any(marker in classes for marker in placeholder_classes):
                    continue
                role, fullname, id_staffer, firstname, secondname, staffer_link = \
                    get_stafferinfo(staffer_ling, players_dict)
                bd_date, bd_month = bd_dict.get(fullname, (np.nan, np.nan))
                rows.append(build_row(side, [
                    role, fullname, id_staffer, firstname, secondname, staffer_link, np.nan,
                    bd_date, bd_month, main] + [np.nan] * 9))

    for n, row in enumerate(rows):
        df.loc[n] = row

    # From here on the goal column counts penalties too. This now also runs
    # when the page has no staff block (it used to be skipped by an early return).
    df['гол'] += df['пенальти']

    # Credit each own goal to a player of the opposite team.
    for _, own_goal_row in df[df['автогол'] >= 1].iterrows():
        autogoals = own_goal_row['автогол']
        id_result = own_goal_row['id_результата']
        # Home/away result ids are the odd/even pair 2*id_match - 1 and 2*id_match.
        opponent_id = id_result - 1 if id_result % 2 == 0 else id_result + 1

        opponents = df[df['id_результата'] == opponent_id]
        if len(opponents) == 0:
            continue

        beneficiary = None
        for preferred_role in ('Нападающий', 'Полузащитник', 'Защитник'):
            candidates = opponents[opponents['амплуа'] == preferred_role]
            if len(candidates) > 0:
                beneficiary = candidates.iloc[0].name
                break
        if beneficiary is None:
            # Fall back to the second listed person, or the only one if there is just one.
            beneficiary = opponents.iloc[min(1, len(opponents) - 1)].name

        # Credit the full own-goal count (it was read but a fixed 1 was added before).
        df.at[beneficiary, 'гол'] += autogoals

    return df

In [32]:
def parse_tournament(tournament_link, season=None, df_old=None):
    """Parse every not-yet-stored match of a tournament calendar.

    Args:
        tournament_link: absolute tournament URL; a trailing 'tables' segment
            (plus the character before it) is removed before '/calendar' is appended.
        season: season label; when falsy it is derived from the 3rd and 7th
            words of the tournament dates line on the calendar page.
        df_old: previously collected rows. When non-empty, match ids continue
            after its maximum 'id_матча', player ids are reused from it, team
            ids are reused from its rows of the same season and tournament,
            and matches whose 'матч' caption already occurs there are skipped.

    Returns:
        DataFrame with the columns of ``df_empty`` holding only the newly
        parsed matches (empty when there is nothing new).
    """
    if tournament_link.endswith('tables'):
        tournament_link = tournament_link[:-7]

    tournament_page = request_page(tournament_link + '/calendar')
    tournament_html = BeautifulSoup(tournament_page.text, 'html.parser')

    tournament = tournament_html.find('h2', class_='tournament__title').string.strip()
    season_fullname = tournament_html.find('li', class_='tournament__info-item tournament__info-dates').string.strip()
    season = season or (season_fullname.split()[2] + '-' + season_fullname.split()[6])

    if df_old is not None and len(df_old) > 0:
        id_match = max(df_old['id_матча']) + 1
        players_dict = df_old[['фио', 'id_фио']].set_index('фио').T.to_dict('list')
        players_dict = {fullname: players_dict[fullname][0] for fullname in players_dict}

        df_old_tournament = df_old[(df_old['сезон'] == season) & (df_old['турнир'] == tournament)]
        old_unique_match = set(df_old_tournament['матч'])
        teams_dict = df_old_tournament[['команда', 'id_команды']].set_index('команда').T.to_dict('list')
        teams_dict = {team: teams_dict[team][0] for team in teams_dict}
    else:
        id_match = 1
        old_unique_match = None
        players_dict = {}
        teams_dict = {}

    match_lings = tournament_html('li', 'schedule__matches-item js-calendar-match js-calendar-last-match')

    # Collect per-match frames and concatenate once: DataFrame.append no
    # longer exists in pandas 2.x and re-copying the frame per match is quadratic.
    match_frames = []
    for match_ling in tqdm(match_lings):

        if old_unique_match is not None:
            # NOTE(review): this caption is built from the schedule's short team
            # names, while parse_match builds 'матч' from the match-promo names —
            # confirm both give the same text, otherwise stored matches are never skipped.
            team1 = match_ling.find('span', class_='schedule__team-name schedule__team-name--right schedule__team-short-name').string.strip()
            team2 = match_ling.find('span', class_='schedule__team-name schedule__team-name--left schedule__team-short-name').string.strip()
            score = match_ling.find('div', class_='schedule__score-main').string.strip()
            match = team1 + ' ' + score + ' ' + team2
            if match in old_unique_match:
                continue

        match_link = yfl + match_ling.find('a', class_='schedule__score').get('href')
        df_match = parse_match(match_link, season, id_match, players_dict, teams_dict)
        if len(df_match) > 0:
            match_frames.append(df_match)

        id_match += 1

    if not match_frames:
        return df_empty.copy()
    return pd.concat(match_frames, ignore_index=True)

In [33]:
def preprocessing():
    """Load previously saved results from 'yfl.xlsx' in the working directory.

    Returns:
        The stored frame without its first column (the index written by
        ``to_excel``), or a copy of the empty template when the file is absent.
    """
    filename = 'yfl.xlsx'
    if filename in os.listdir():
        stored = pd.read_excel(filename)
        return stored.iloc[:, 1:]
    return df_empty.copy()

def request_page(link, n=5, t_sleep=4, timeout=30):
    """GET a page, retrying on network errors.

    Args:
        link: URL to fetch.
        n: maximum number of attempts.
        t_sleep: seconds to wait between attempts.
        timeout: per-request timeout in seconds passed to ``requests.get``;
            without one a stalled connection would block forever.

    Returns:
        The ``requests`` response of the first attempt that does not raise.
        The HTTP status is not checked.

    Raises:
        ConnectionError: (built-in) when all ``n`` attempts failed; the last
            requests error is attached as ``__cause__``.
    """
    last_error = None
    for attempt in range(n):
        try:
            return requests.get(link, timeout=timeout)
        except requests.RequestException as error:
            # Only network-level failures are retried; KeyboardInterrupt and
            # programming errors propagate immediately.
            last_error = error
            if attempt < n - 1:
                time.sleep(t_sleep)
    raise ConnectionError('failed to fetch %s after %d attempts' % (link, n)) from last_error
            
def find_season_lings(main_link):
    """Render the tournaments page in Firefox and return its season entries.

    A real browser is used instead of request_page, presumably because the
    season list is filled in by JavaScript — TODO confirm.

    Args:
        main_link: URL of the page holding the season list.

    Returns:
        List of ``<li class="js-div-dropdown-li">`` tags in reversed page order.
    """
    # Create the driver outside the try block: if start-up fails there is
    # nothing to clean up and the original error must not be masked.
    driver = webdriver.Firefox()
    try:
        driver.get(main_link)
        main_html = BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        # quit() ends the whole browser session, not just the current window.
        driver.quit()
    season_lings = main_html('li', class_='js-div-dropdown-li')
    return season_lings[::-1]

def parse_season(season_link):
    """Yield the tournament entries of a season archive page, in page order.

    Args:
        season_link: absolute URL of the season page.

    Yields:
        Each ``<li class="tournaments-archive__item">`` tag of the page.
    """
    season_html = BeautifulSoup(request_page(season_link).text, 'html.parser')
    yield from season_html.find_all('li', class_='tournaments-archive__item')

In [34]:
# Championship label (same value as assigned next to COLUMNS earlier in the notebook).
championship = 'ЮФЛ'
# Site root; the parsers prepend it to the relative hrefs they read from pages.
yfl = 'https://yflrussia.ru'
# Page that find_season_lings opens to read the list of seasons.
main_link = yfl + '/tournaments'

In [35]:
print('Предобработка')

# Walk every season and tournament, parse the matches that are not stored yet
# and save the accumulated rows to one Excel file per season.
step = 0
season_lings = find_season_lings(main_link)
for season_ling in season_lings:
    # A label such as '2019/20' becomes '2019-2020'; the '/' must not reach the
    # file name. Labels without exactly one '/' are joined with '-' instead of failing.
    season_parts = season_ling.a.string.strip().split('/')
    if len(season_parts) == 2:
        season = season_parts[0] + '-20' + season_parts[1]
    else:
        season = '-'.join(season_parts)
    path = 'yfl' + '_s' + season + '.xlsx'
    # The first column of a stored file is the index written by to_excel.
    df = pd.read_excel(path).iloc[:, 1:] if path in os.listdir() else df_empty.copy()
    season_link = yfl + season_ling.a.get('href')
    tournament_lings = parse_season(season_link)
    for tournament_ling in tournament_lings:
        step += 1
        tournament = tournament_ling.find('p', class_='tournaments-archive__title').string.strip()
        clear_output()
        print('Шаг: %d' % step, 'Сезон: %s' % season, 'Турнир: %s' % tournament, sep='\n')
        tournament_url = tournament_ling.find('a', class_='tournaments-archive__link').get('href')
        tournament_link = yfl + tournament_url
        df_tournament = parse_tournament(tournament_link, season, df_old=df)
        if len(df_tournament) > 0:
            # DataFrame.append was removed in pandas 2.0; concat is the replacement.
            if len(df) > 0:
                df = pd.concat([df, df_tournament], ignore_index=True)
            else:
                df = df_tournament
            df.to_excel(path)
print('ALL DONE')

Шаг: 1
Сезон: 2019/20
Турнир: Юношеская футбольная лига. Сезон 2019/20


100%|██████████████████████████████████████████████████████████████████████████████████| 90/90 [02:27<00:00,  1.63s/it]


FileNotFoundError: [Errno 2] No such file or directory: 'yfl_s2019/20.xlsx'