In [10]:
import pandas as pd
import numpy as np
import requests 
from bs4 import BeautifulSoup
from tqdm import tqdm
import os

In [2]:
def monthchanger(month):
    """Turn a lower-case Russian genitive month name into its capitalised nominative form.

    Any value that is not one of the twelve genitive names listed below is
    returned unchanged.
    """
    nominative_by_genitive = {
        'января': 'Январь',
        'февраля': 'Февраль',
        'марта': 'Март',
        'апреля': 'Апрель',
        'мая': 'Май',
        'июня': 'Июнь',
        'июля': 'Июль',
        'августа': 'Август',
        'сентября': 'Сентябрь',
        'октября': 'Октябрь',
        'ноября': 'Ноябрь',
        'декабря': 'Декабрь',
    }
    return nominative_by_genitive.get(month, month)

def splitdate(date):
    """Split a calendar date heading into ``(day, month, year, weekday)``.

    The heading is split on whitespace. A four-token heading is read as
    day, month, year, weekday, and the last character of the year token is
    dropped. Any other heading must have exactly three tokens (day, month,
    weekday); there the last character of the month token is dropped and the
    year is hard-coded to ``'2020'``. The month is passed through
    ``monthchanger`` before being returned.

    Raises:
        ValueError: if the heading has neither three nor four tokens.
    """
    tokens = date.split()
    if len(tokens) != 4:
        day, raw_month, weekday = tokens
        # presumably a trailing comma after the month when the year is omitted — TODO confirm
        raw_month = raw_month[:-1]
        year = '2020'
    else:
        day, raw_month, raw_year, weekday = tokens
        # presumably a trailing comma after the year — TODO confirm
        year = raw_year[:-1]
    return day, monthchanger(raw_month), year, weekday

def fullrole(role):
    """Expand an abbreviated position label to its full Russian name.

    Returns ``np.nan`` for any label that is not one of the four known
    abbreviations.
    """
    full_names = {
        'Вр.': 'Вратарь',
        'Зщ.': 'Защитник',
        'Пз.': 'Полузащитник',
        'Нп.': 'Нападающий',
    }
    return full_names.get(role, np.nan)

def reversename(name):
    """Move the last whitespace-separated word of ``name`` to the front.

    The remaining words keep their order and are joined with single spaces.
    A one-word name comes back followed by a single space; an empty or
    blank name raises ``IndexError``.
    """
    words = name.split()
    return words[-1] + ' ' + ' '.join(words[:-1])

def create_bddict(soup):
    """Build a ``{player name: [birth date, birth month]}`` lookup from a team page.

    Every ``<tr class="table__row">`` of ``soup`` is inspected. Rows without a
    usable player name are skipped. The name is passed through
    ``reversename`` before being used as the key.

    The birth-date cell text is sliced at fixed positions (``[:2]``, ``[3:5]``,
    ``[6:10]``) — presumably a ``DD.MM.YYYY`` string; TODO confirm against the
    site. The stored value is ``['<day> <genitive month> <year>', <nominative
    month>]``, or ``[np.nan, np.nan]`` when the cell is missing or empty.

    Args:
        soup: parsed team page (a BeautifulSoup object or any tag supporting
            the ``soup(name, class_=...)`` call form).

    Returns:
        dict mapping the reversed player name to a two-element list.
    """
    genitive_by_number = {
        '01': 'января',
        '02': 'февраля',
        '03': 'марта',
        '04': 'апреля',
        '05': 'мая',
        '06': 'июня',
        '07': 'июля',
        '08': 'августа',
        '09': 'сентября',
        '10': 'октября',
        '11': 'ноября',
        '12': 'декабря',
    }

    bd_dict = {}
    for player in soup('tr', class_='table__row'):
        # Rows with no name tag, or a name tag that is not a single string,
        # are skipped explicitly instead of via a catch-all ``except``.
        name_tag = player.find('p', class_='table__player-name')
        if name_tag is None or name_tag.string is None:
            continue
        try:
            name = reversename(name_tag.string.strip())
        except IndexError:
            # reversename raises IndexError on a blank name.
            continue

        date_cell = player.find('td', class_='table__cell table__cell--birth-date mobile-hide')
        if date_cell is None or date_cell.string is None:
            # A missing cell is treated like an empty one instead of crashing.
            bd_date = ''
        else:
            bd_date = date_cell.string.strip()
        if bd_date == '':
            bd_dict[name] = [np.nan, np.nan]
            continue

        bd_day = bd_date[:2]
        bd_year = bd_date[6:10]
        # An unrecognised month code is kept as-is, as before.
        bd_month = genitive_by_number.get(bd_date[3:5], bd_date[3:5])
        bd_dict[name] = [bd_day + ' ' + bd_month + ' ' + bd_year, monthchanger(bd_month)]
    return bd_dict

def get_result(score1, score2):
    """Return the outcome labels ``(result1, result2)`` for the two sides.

    The side with the strictly higher score gets ``'Победа'`` and the other
    ``'Поражение'``; otherwise both get ``'Ничья'``.
    """
    win, loss, draw = 'Победа', 'Поражение', 'Ничья'
    if score1 > score2:
        return win, loss
    if score2 > score1:
        return loss, win
    return draw, draw

def get_idteam(team, teams_dict):
    """Return the numeric id of ``team``, allocating the next one on first sight.

    ``teams_dict`` maps team names to ids and keeps the last allocated id
    under the ``'COUNTER'`` key; it is updated in place when a new team is
    registered.
    """
    known_id = teams_dict.get(team)
    if known_id is not None:
        return known_id
    new_id = teams_dict['COUNTER'] + 1
    teams_dict['COUNTER'] = new_id
    teams_dict[team] = new_id
    return new_id

def get_idplayer(player, players_dict):
    """Return the numeric id of ``player``, allocating the next one on first sight.

    ``players_dict`` maps names to ids and keeps the last allocated id under
    the ``'COUNTER'`` key; it is updated in place when a new name is
    registered.
    """
    existing = players_dict.get(player)
    if existing is not None:
        return existing
    players_dict['COUNTER'] = players_dict['COUNTER'] + 1
    fresh = players_dict['COUNTER']
    players_dict[player] = fresh
    return fresh

def get_playertime(player, main):
    """Work out when a player entered and left the pitch.

    Args:
        player: protocol entry supporting ``find('li', class_=...)``.
        main: ``'Старт'`` for a starting player; any other value is treated
            as a substitute.

    Returns:
        ``(starttime, endtime, playtime)``. A starter enters at minute 0. A
        substitute enters at the minute in the "subs--in" item; without a
        usable one all three values are ``np.nan``. The leaving minute comes
        from the "subs--out" item and defaults to 90.
    """
    sub_out_class = 'event-item event-item-subs event-item-subs--out'
    sub_in_class = 'event-item event-item-subs event-item-subs--in'

    def minute_of(tag):
        """Return the minute written in ``tag``, or None when there is none."""
        # The text must be longer than one character; its last character is
        # dropped before the integer conversion.
        if tag and len(tag.string.strip()) > 1:
            return int(tag.string.strip()[:-1])
        return None

    def leave_minute():
        """Return the substitution-out minute, or 90 when none is recorded."""
        minute = minute_of(player.find('li', class_=sub_out_class))
        return 90 if minute is None else minute

    if main == 'Старт':
        starttime = 0
        endtime = leave_minute()
    else:
        starttime = minute_of(player.find('li', class_=sub_in_class))
        if starttime is None:
            starttime = endtime = np.nan
        else:
            endtime = leave_minute()
    return starttime, endtime, endtime - starttime

def get_events(player):
    """Count the match events shown as ``<svg>`` icons for one player.

    The first CSS class of each icon names the event kind. The multiplicity
    is read from the element two siblings after the icon's parent: when that
    element carries the ``event-item--number`` class, its stripped text minus
    the first character is parsed as an integer. In every other case the
    count is 1.

    An icon of a kind that appears more than once overwrites the earlier
    count rather than adding to it. Icons without a class attribute and
    icons of unknown kinds are ignored.

    Args:
        player: protocol entry; calling ``player('svg')`` must return the
            icon elements.

    Returns:
        ``(goals, assists, penalties, yc, rc, autogoals)``.
    """
    counts = {
        'goal': 0,
        'assist': 0,
        'penalty': 0,
        'yellow-card': 0,
        'red-card': 0,
        'autogoal': 0,
    }
    for event in player('svg'):
        classes = event.get('class')
        if not classes:
            # No class attribute: nothing identifies the event kind.
            continue
        kind = classes[0]
        if kind not in counts:
            continue
        try:
            badge = event.parent.next_sibling.next_sibling
            if 'event-item--number' in badge.get('class'):
                n = int(badge.string.strip()[1:])
            else:
                n = 1
        except (AttributeError, TypeError, ValueError):
            # Missing sibling, missing class list, or non-numeric text:
            # fall back to a single event. Other exceptions (for example
            # KeyboardInterrupt) are no longer swallowed.
            n = 1
        counts[kind] = n
    return (counts['goal'], counts['assist'], counts['penalty'],
            counts['yellow-card'], counts['red-card'], counts['autogoal'])

def get_playerinfo(player, main, bd_dict, players_dict):
    """Collect every per-player value for one protocol entry.

    Reads the module-level ``yfl`` base URL to build the player link, so that
    global must be defined before this function is called.

    Args:
        player: protocol ``<li>`` element of one player.
        main: ``'Старт'`` or ``'Запас'``; passed to ``get_playertime`` and
            returned unchanged as the last tuple element.
        bd_dict: ``{full name: (birth date, birth month)}`` lookup for the
            player's team; unknown names yield ``(np.nan, np.nan)``.
        players_dict: id registry passed to ``get_idplayer``; may be updated
            in place.

    Returns:
        ``(role, number, fullname, id_player, firstname, secondname,
        starttime, endtime, playtime, goals, assists, yc, rc, penalties,
        autogoals, player_link, bd_date, bd_month, main)``.
    """
    fullname = player.a.string.strip()
    id_player = get_idplayer(fullname, players_dict)

    # The first word is taken as the first name, the rest as the surname.
    name_parts = fullname.split()
    firstname = name_parts[0]
    secondname = ' '.join(name_parts[1:])

    player_link = yfl + player.a.get('href')

    try:
        role = fullrole(player.find('span', class_='match-protocol__member-amplua').string.strip())
    except AttributeError:
        # Span missing, or its content is not a single string.
        role = np.nan
    try:
        number = int(player.find('span', class_='match-protocol__member-number').string.strip())
    except (AttributeError, ValueError):
        # Span missing, not a single string, or not numeric.
        number = np.nan

    starttime, endtime, playtime = get_playertime(player, main)

    goals, assists, penalties, yc, rc, autogoals = get_events(player)

    bd_date, bd_month = bd_dict.get(fullname, (np.nan, np.nan))

    return (role, number, fullname, id_player, firstname, secondname, starttime, endtime, playtime,
            goals, assists, yc, rc, penalties, autogoals, player_link, bd_date, bd_month, main)

def get_stinfo(st, players_dict):
    """Collect the values for one staff member, fetching their profile page.

    Reads the module-level ``yfl`` base URL to build the profile link and
    performs one HTTP GET for that page.

    Args:
        st: protocol ``<li>`` element of one staff member.
        players_dict: id registry passed to ``get_idplayer`` (staff share the
            registry with players); may be updated in place.

    Returns:
        ``(role, fullname, id_st, firstname, secondname, st_link, bd_date,
        bd_month)``. The first word of the name is taken as the surname and
        the second as the first name; a missing part is ``np.nan``.
        ``bd_date`` and ``bd_month`` are ``np.nan`` when the profile page has
        no usable birth date.
    """
    fullname = st.find('p', class_='match-protocol__staff-name').string.strip()
    id_st = get_idplayer(fullname, players_dict)

    # A one-word (or blank) name used to raise IndexError here.
    name_parts = fullname.split()
    secondname = name_parts[0] if name_parts else np.nan
    firstname = name_parts[1] if len(name_parts) > 1 else np.nan

    role = st.find('p', class_='match-protocol__staff-position').string.strip()

    st_link = yfl + st.a.get('href')
    st_page = requests.get(st_link)
    st_soup = BeautifulSoup(st_page.text, 'html.parser')

    try:
        bd_date = st_soup.find('span', class_='staff-promo__value').string.strip()
        # The second whitespace-separated token is taken as the month name.
        bd_month = monthchanger(bd_date.split()[1])
    except (AttributeError, IndexError):
        # Span missing, not a single string, or fewer than two tokens.
        bd_date = np.nan
        bd_month = np.nan

    return role, fullname, id_st, firstname, secondname, st_link, bd_date, bd_month

In [56]:
def parse(url, season, tournament, df_old=None):
    """Scrape the listed matches of one tournament into a long-format DataFrame.

    The tournament calendar at ``url + '/calendar'`` is downloaded and every
    ``<li>`` carrying the "js-calendar-last-match" class string is processed.
    For each match both team pages and the match protocol are fetched. The
    result has one row per player and per staff member, or two placeholder
    rows (one per side) when the additional score label is ``'ТП'``
    (presumably a technical result — TODO confirm).

    After scraping, penalties are added to the goal column, and every own
    goal is credited to the opposing side: to its first listed forward, or,
    when that side has no forward, to its first listed row.

    Args:
        url: tournament base URL (without the ``/calendar`` suffix).
        season: season label written to the ``'сезон'`` column.
        tournament: tournament label written to the ``'турнир'`` column.
        df_old: optional previously scraped frame. Matches of this
            tournament already present in it (same ``'матч'`` text) are
            skipped, and match numbering continues after its distinct
            ``'id_матча'`` values.

    Returns:
        DataFrame with the 37 columns listed in ``columns`` below; empty when
        no new match was found.
    """
    yfl = 'https://yflrussia.ru'
    columns = ['чемпионат', 'сезон', 'турнир', 'тур', 'месяц матча', 'матч', 'хозяева', 'хозяева голы',
               'гости голы', 'гости', 'гости\\хозяева', 'результат', 'id_матча',
               'id_результата', 'амплуа', 'номер игрока', 'фио', 'id_фио', 'имя', 'фамилия',
               'минута выхода', 'минута ухода', 'минут на поле', 'гол', 'ассист', 'жк', 'кк',
               'пенальти', 'автогол', 'ссылка игрока', 'дата рождения', 'месяц рождения',
               'старт\\запас', 'команда', 'id_команды', 'CategoryColumn', 'судья']
    championship = 'ЮФЛ'

    page = requests.get(url + '/calendar')
    soup = BeautifulSoup(page.text, 'html.parser')

    if df_old is not None:
        df_old = df_old[df_old['турнир'] == tournament]
        id_match = df_old['id_матча'].unique().shape[0] + 1
    else:
        id_match = 1

    teams_dict = {'COUNTER': 0}
    players_dict = {'COUNTER': 0}

    # Per-match frames are collected and concatenated once at the end;
    # DataFrame.append was removed in pandas 2.0.
    match_frames = []

    matches = soup('li', 'schedule__matches-item js-calendar-match js-calendar-last-match')

    for match in tqdm(matches):
        team1 = match.find('a', class_='schedule__team-1')
        team1_name = team1.span.string.strip()
        team2 = match.find('a', class_='schedule__team-2')
        team2_name = team2.span.string.strip()
        matchinfo_score = match.find('a', class_='schedule__score')
        score = matchinfo_score.find('div', class_='schedule__score-main').string.strip()

        match_name = team1_name + ' ' + score + ' ' + team2_name

        # Skip matches that are already present in the previous scrape.
        if df_old is not None and (df_old['турнир'] + df_old['матч'] == tournament + match_name).any():
            continue

        n = 0
        df_match = pd.DataFrame(columns=columns)

        # Round number derived from the running match counter: 6 matches per
        # round for 'ЮФЛ-1(19/20)', 7 otherwise (presumably half the number
        # of teams — TODO confirm).
        if tournament == 'ЮФЛ-1(19/20)':
            tour = (id_match - 1)//6 + 1
        else:
            tour = (id_match - 1)//7 + 1
        date = match.parent.previous_sibling.previous_sibling.span.string.strip()
        _, month, _, _ = splitdate(date)

        id_team1 = get_idteam(team1_name, teams_dict)
        team1_link = yfl + team1.get('href')
        team1_page = requests.get(team1_link)
        team1_soup = BeautifulSoup(team1_page.text, 'html.parser')
        bd1_dict = create_bddict(team1_soup)

        id_team2 = get_idteam(team2_name, teams_dict)
        team2_link = yfl + team2.get('href')
        team2_page = requests.get(team2_link)
        team2_soup = BeautifulSoup(team2_page.text, 'html.parser')
        bd2_dict = create_bddict(team2_soup)

        score1 = int(score.split()[0])
        score2 = int(score.split()[-1])
        result1, result2 = get_result(score1, score2)

        matchinfo_link = yfl + matchinfo_score.get('href')
        matchinfo_page = requests.get(matchinfo_link)
        matchinfo_soup = BeautifulSoup(matchinfo_page.text, 'html.parser')

        referee = matchinfo_soup.find('div', class_='match-protocol__referee')
        if referee:
            referee_name = referee('span')[0].string.strip() + ' ' + referee('span')[1].string.strip()
        else:
            referee_name = np.nan

        # Values shared by every row of this match (first ten columns).
        head = [championship, season, tournament, tour, month, match_name,
                team1_name, score1, score2, team2_name]
        # Per-side values: CSS suffix, side label, result, result id (odd for
        # the home side, even for the away side), team name, team id and
        # birth-date lookup.
        sides = (
            ('left', 'Хозяева', result1, 2*id_match - 1, team1_name, id_team1, bd1_dict),
            ('right', 'Гости', result2, 2*id_match, team2_name, id_team2, bd2_dict),
        )

        tp = matchinfo_score.find('div', class_='schedule__score-additional').string.strip()
        if tp == 'ТП':
            # No line-ups are read: one placeholder row per side.
            for row, (_, label, result, id_result, team_name, id_team, _) in enumerate(sides):
                df_match.loc[row] = (head + [label, result, id_match, id_result]
                                     + ['ТП', np.nan, 'ТП', 0] + [np.nan] * 15
                                     + [team_name, id_team, matchinfo_link, referee_name])
            match_frames.append(df_match)
            id_match += 1
            continue

        # The first "composition" block holds the starters, the next one the
        # substitutes.
        mainstuff = matchinfo_soup.find('div', class_='match-protocol__composition')
        reservestuff = mainstuff.find_next('div', class_='match-protocol__composition')
        for stuff in (mainstuff, reservestuff):
            main = 'Старт' if stuff is mainstuff else 'Запас'
            for suffix, label, result, id_result, team_name, id_team, bd_dict in sides:
                member_class = 'match-protocol__member match-protocol__member--' + suffix
                for player in stuff('li', class_=member_class):
                    # get_playerinfo returns its values in column order
                    # ('амплуа' ... 'старт\\запас').
                    info = get_playerinfo(player, main, bd_dict, players_dict)
                    df_match.loc[n] = (head + [label, result, id_match, id_result] + list(info)
                                       + [team_name, id_team, matchinfo_link, referee_name])
                    n += 1

        # An optional third "composition" block lists the staff.
        staff = reservestuff.find_next('div', class_='match-protocol__composition')
        if staff is not None:
            for suffix, label, result, id_result, team_name, id_team, _ in sides:
                team_class = 'match-protocol__team match-protocol__team--' + suffix
                for st in staff.find('ul', class_=team_class)('li'):

                    if 'match-protocol-empty' in st.get('class') or 'match-protocol__staff--empty' in st.get('class'):
                        continue

                    role, fullname, id_st, firstname, secondname, st_link, bd_date, bd_month = \
                        get_stinfo(st, players_dict)

                    df_match.loc[n] = (head + [label, result, id_match, id_result, role, np.nan,
                                               fullname, id_st, firstname, secondname]
                                       + [np.nan] * 9
                                       + [st_link, bd_date, bd_month, 'Штаб']
                                       + [team_name, id_team, matchinfo_link, referee_name])
                    n += 1

        match_frames.append(df_match)
        id_match += 1

    if match_frames:
        df = pd.concat(match_frames, ignore_index=True)
    else:
        df = pd.DataFrame(columns=columns)

    # Penalties are counted as goals as well.
    df['гол'] += df['пенальти']

    for _, player_auto in df[df['автогол'] >= 1].iterrows():
        autogoals = player_auto['автогол']
        id_result = player_auto['id_результата']
        # Result ids come in (odd, even) pairs per match: flip to the opponent.
        if id_result % 2 == 0:
            id_result -= 1
        else:
            id_result += 1

        opponents = df[df['id_результата'] == id_result]
        forwards = opponents[opponents['амплуа'] == 'Нападающий']
        candidates = opponents if forwards.empty else forwards
        if candidates.empty:
            continue
        # Credit every own goal (previously always exactly one) to the first
        # candidate row.
        df.at[candidates.index[0], 'гол'] += autogoals

    return df

In [57]:
# Site root; get_playerinfo and get_stinfo read this module-level name.
yfl = 'https://yflrussia.ru'

# Tournament pages, relative to the site root.
season_19_20 = '/tournament/1005599'
season_20_21_ufl1 = '/tournament/1011826'
season_20_21_ufl2 = '/tournament/1011827'

url0, url1, url2 = (
    yfl + path for path in (season_19_20, season_20_21_ufl1, season_20_21_ufl2)
)

# Three parallel lists, one entry per tournament to scrape.
urls = [url0, url1, url2]
seasons = ['2019/2020'] + ['2020/2021'] * 2
tournaments = ['ЮФЛ-1(19/20)', 'ЮФЛ-1', 'ЮФЛ-2']

In [58]:
# Load the previous scrape when it exists, then add the newly published
# matches of every tournament.
if 'yfl.xlsx' in os.listdir():
    # The first column of the saved sheet is the DataFrame index; drop it.
    df_old = pd.read_excel('yfl.xlsx').iloc[:, 1:]
    df = df_old.copy()
else:
    df_old = None
    df = pd.DataFrame(columns=['чемпионат', 'сезон', 'турнир', 'тур', 'месяц матча', 'матч', 'хозяева', 'хозяева голы',
                           'гости голы', 'гости', 'гости\\хозяева', 'результат', 'id_матча',
                           'id_результата', 'амплуа', 'номер игрока', 'фио', 'id_фио', 'имя', 'фамилия',
                           'минута выхода', 'минута ухода', 'минут на поле', 'гол', 'ассист',
                           'жк', 'кк', 'пенальти', 'автогол', 'ссылка игрока', 'дата рождения', 'месяц рождения',
                           'старт\\запас', 'команда', 'id_команды', 'CategoryColumn', 'судья'])

# DataFrame.append was removed in pandas 2.0: gather the per-tournament
# frames and concatenate them in a single call.
scraped = [parse(url, season, tournament, df_old=df_old)
           for url, season, tournament in zip(urls, seasons, tournaments)]
df = pd.concat([df] + scraped, ignore_index=True)

100%|█████████████████████████████████████████████████████████████████████████████████| 90/90 [00:00<00:00, 857.63it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 91/91 [00:00<00:00, 919.73it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 91/91 [00:30<00:00,  2.97it/s]


In [32]:
def _dense_ids(keys):
    """Number the distinct values of ``keys`` 1, 2, 3, ... in order of first appearance."""
    lookup = {key: position for position, key in enumerate(keys.unique(), start=1)}
    # Series.map does one dictionary lookup per row. Series.replace with one
    # key per distinct value is far slower and relies on an object-to-int
    # downcast that pandas 2.2 deprecates.
    return keys.map(lookup)


# Match ids: one per distinct (match text, tournament, round) combination.
col = df['матч'] + df['турнир'] + df['тур'].astype(str)
df['id_матча'] = _dense_ids(col)

# Result ids: one per (match id, team) pair; depends on the ids set just above.
col = df['id_матча'].astype(str) + df['команда']
df['id_результата'] = _dense_ids(col)

# Person ids: one per distinct full name.
df['id_фио'] = _dense_ids(df['фио'])

# Team ids: one per (team, tournament) pair.
col = df['команда'] + df['турнир']
df['id_команды'] = _dense_ids(col)

In [33]:
# The index is written as the first sheet column; the loading cell drops it
# again with ``.iloc[:, 1:]``.
df.to_excel('yfl.xlsx', index=True)

In [47]:
# Number of distinct match ids in the combined frame.
len(df['id_матча'].unique())

265