In [None]:
import numpy as np
import pandas as pd
import copy
pd.set_option('display.max_columns', None)

import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

from itertools import chain

import requests
# Hosts of the two ESPN API families referenced in this notebook.
base_url_fir = "https://sports.core.api.espn.com"
base_url_sec = 'https://site.web.api.espn.com'

# Listing endpoints of the core API, one per entity type. Each returns a (possibly
# paginated) collection of `$ref` links.
queries_for_ids = {
    entity: f"{base_url_fir}/v2/sports/football/leagues/nfl/{tail}"
    for entity, tail in (
        ("athletes", "athletes?limit=1000&active=true"),
        ("teams", "teams?limit=32"),
        ("positions", "positions?limit=75"),
        ("venues", "venues?limit=700"),
    )
}

In [None]:
def get_full_response_list(base):
    """Collect the ``$ref`` link of every item behind a (possibly paginated) listing URL.

    Parameters
    ----------
    base : str
        URL of the listing endpoint, with or without a query string.

    Returns
    -------
    list
        The ``'$ref'`` value of every item, in page order.

    Notes
    -----
    The response of the initial request is reused for the page it reports in
    ``'pageIndex'`` instead of being downloaded a second time.
    """
    first_page = requests.get(base).json()

    # Unpaginated listing: everything is already in the first response.
    if 'pageIndex' not in first_page:
        return [item['$ref'] for item in first_page['items']]

    # Append the page parameter with the right separator so that a URL without
    # a query string stays valid.
    separator = '&' if '?' in base else '?'

    pages = []
    for page_number in range(1, first_page['pageCount'] + 1):
        if page_number == first_page['pageIndex']:
            # Already fetched by the initial request.
            pages.append(first_page['items'])
        else:
            page_url = f'{base}{separator}page={page_number}'
            pages.append(requests.get(page_url).json()['items'])

    return [item['$ref'] for item in chain.from_iterable(pages)]


def del_cell_ref_field(cell):
    """Return a deep copy of ``cell``; when it is a dict, the copy has no top-level ``'$ref'`` key.

    The argument itself is never modified, and nested ``'$ref'`` keys are left untouched.
    """
    duplicate = copy.deepcopy(cell)
    if isinstance(duplicate, dict):
        # Only the top-level link is stripped.
        duplicate.pop('$ref', None)
    return duplicate


def request_table_data(df_name, source_column, target_column):
    """Download the JSON behind every link of ``source_column`` into ``target_column``.

    Null cells are copied over unchanged. Every other cell is requested and stored
    as the parsed JSON with its top-level ``'$ref'`` key removed. ``df_name`` is
    modified in place; nothing is returned.
    """
    def _load(link):
        if pd.isnull(link):
            return link
        return del_cell_ref_field(requests.get(link).json())

    df_name[target_column] = df_name[source_column].apply(_load)


def process_link_columns(df_name, source_column_list):
    """Resolve every link column of ``source_column_list`` and drop the originals.

    For each column ``c`` a new column ``'loaded_from_' + c`` receives the downloaded
    documents. Afterwards all source columns are removed. ``df_name`` is modified
    in place; nothing is returned.
    """
    for link_column in source_column_list:
        loaded_column = 'loaded_from_' + link_column
        request_table_data(df_name, link_column, loaded_column)

    # The raw links are no longer needed once their content has been loaded.
    df_name.drop(columns=source_column_list, inplace=True)


def check_dict_link(cell):
    """Tell whether a dict cell equals the JSON document its ``'$ref'`` link returns.

    Parameters
    ----------
    cell : object
        Any cell value.

    Returns
    -------
    bool
        ``True`` only when ``cell`` is a dict carrying a ``'$ref'`` key and the
        document downloaded from that link compares equal to ``cell``. ``False``
        for non-dict cells and for dicts without a ``'$ref'`` key (no request is
        made for those).
    """
    # A dict without a link cannot be checked; treating it as "not a self-link"
    # avoids a KeyError when a column mixes linked and plain dicts.
    if not isinstance(cell, dict) or '$ref' not in cell:
        return False
    return cell == requests.get(cell['$ref']).json()


def unwrap_data(df, column_name):
    """Expand the nested values of one column into a frame of separate columns.

    Three cases, decided by the content of ``df[column_name]``:

    * at least one cell is a list: one output column per list position, named
      ``col_<k>_from_<column_name>``; a non-list cell is treated as a one-element list;
    * otherwise, at least one cell is not null: one output column per key found in
      the dict cells, named ``<key>_from_<column_name>``; a non-dict cell is copied
      into every one of those columns;
    * otherwise (every cell is null): an empty ``DataFrame`` is returned.

    In the first two cases the result carries the same index as ``df``.

    Side effects: in the dict case ``df[column_name]`` may be overwritten in place
    with copies stripped of their ``'$ref'`` key, and ``check_dict_link`` issues one
    HTTP request per dict cell.
    """

    if df[column_name].apply(lambda x: isinstance(x, list)).any():

            # One column per list position, up to the longest list in the column.
            columns_list = [f'col_{x}_from_{column_name}' for x in range(df[column_name].apply(lambda x: len(x) if isinstance(x, list) else 0).max())]
            # Built on a positional 0..n-1 index; re-aligned to df.index before returning.
            sup_df = pd.DataFrame(columns=columns_list, index=range(df.shape[0]))
            for row, field in enumerate(df[column_name]):
                # Scalars (including NaN) are handled as a list with a single element.
                if not isinstance(field, list):
                    field = [field]
                for num, block in enumerate(field):
                    sup_df.at[row, f'col_{num}_from_{column_name}'] = block


    elif df[column_name].notna().any():

        # Strip the '$ref' key from the cells only when the column contains dicts with a
        # '$ref' and at least one of them is identical to the document its link returns.
        if (df[column_name].apply(lambda x: ((isinstance(x, dict))) and ('$ref' in x.keys())).any()) and (df[column_name].apply(lambda x: check_dict_link(x)).any()):
            df[column_name] = df[column_name].apply(lambda x: del_cell_ref_field(x))

        col_names = set()

        # Union of the keys of every dict cell: these become the output columns.
        for i in df[column_name]:
            if isinstance(i, dict):
                col_names = col_names.union(i.keys())

        col_names = list(col_names)


        sup_df = pd.DataFrame(columns=[(x + '_from_' + column_name) for x in col_names],
                              index=range(df.shape[0]))
        for i, j in enumerate(df[column_name]):
            # NOTE(review): pd.isnull(j) is only evaluated when j is a dict, where it is
            # presumably always False — confirm the second operand is needed.
            if not isinstance(j, dict) or pd.isnull(j):
                # Non-dict value: repeat it in every expanded column of this row.
                for keys in col_names:
                    sup_df.at[i, keys + '_from_' + column_name] = j
            else:
                for keys, values in j.items():
                    sup_df.at[i, keys + '_from_' + column_name] = values

    else:
        # Nothing to expand: the column holds only nulls.
        return pd.DataFrame()
    # Switch from the positional index back to the caller's index.
    sup_df.index = df.index
    return sup_df


def unwrap_list_columns(df_name, source_column_list):
    """Expand every column of ``source_column_list`` and return the widened frame.

    The columns produced by ``unwrap_data`` are appended on the right, one source
    column at a time. The source columns are then dropped and a confirmation
    message is printed. The returned frame must be used by the caller: as soon as
    one column has been expanded it is a new object.
    """
    processed = []
    for column in source_column_list:
        expanded = unwrap_data(df_name, column)
        df_name = pd.concat([df_name, expanded], axis=1)
        processed.append(column)

    df_name.drop(columns=source_column_list, inplace=True)

    pending = [column for column in source_column_list if column not in processed]
    if len(pending) == 0:
        print('Unwrapped successful')
    return df_name


def get_parent_position_id(base):
    """Return the ``'id'`` field of the document behind the link ``base``.

    NaN is returned when ``base`` is null or when the document has no ``'id'`` key.
    """
    if pd.isna(base):
        return np.nan
    document = requests.get(base).json()
    return document['id'] if 'id' in document.keys() else np.nan


def get_athlete_id(base):
    """Fetch the athlete document behind the link ``base`` and return its ``'id'``.

    NaN is returned when ``base`` is null or when the document has no ``'id'`` key.
    """
    if pd.isna(base):
        return np.nan
    athlete = requests.get(base).json()
    return athlete.get('id', np.nan)


def get_season_start_and_team_id(base):
    """Resolve a team link into ``(team_id, season_start_date)``.

    The team document is downloaded first. Its ``'groups'`` link is then followed
    to a document whose ``'season'`` link leads to the season document, from which
    ``'startDate'`` is taken.

    Parameters
    ----------
    base : str or null
        Link to a team document.

    Returns
    -------
    tuple
        ``(team_id, season_start)``. Note the order: the team id comes first.
        Each element is NaN when it cannot be determined: null ``base``, missing
        ``'id'``, ``'groups'`` that is not a bare ``{'$ref': ...}`` link, or a
        follow-up document without the expected ``'season'`` link or
        ``'startDate'`` field.
    """
    if pd.isna(base):
        return np.nan, np.nan

    team = requests.get(base).json()
    team_id = team.get('id', np.nan)

    # Only a bare link object can be followed; anything else means the season
    # cannot be identified from this document.
    groups = team.get('groups')
    if not isinstance(groups, dict) or set(groups) != {'$ref'}:
        return team_id, np.nan

    group = requests.get(groups['$ref']).json()
    season_link = group.get('season') if isinstance(group, dict) else None
    if not isinstance(season_link, dict) or '$ref' not in season_link:
        return team_id, np.nan

    season = requests.get(season_link['$ref']).json()
    if not isinstance(season, dict):
        return team_id, np.nan
    return team_id, season.get('startDate', np.nan)


def get_season_start(base):
    """Return the ``'startDate'`` field of the season document behind the link ``base``.

    NaN is returned when ``base`` is null or when the document has no ``'startDate'`` key.
    """
    if pd.isna(base):
        return np.nan
    season = requests.get(base).json()
    if 'startDate' not in season.keys():
        return np.nan
    return season['startDate']

In [None]:
# Download the `$ref` link of every athlete returned by the listing endpoint and keep
# them in a one-column frame; the links themselves are resolved later, chunk by chunk.
# NOTE(review): the URL is written out by hand (http scheme, lang/region parameters, no
# `active=true`) instead of reusing queries_for_ids["athletes"] — confirm this is intended.
athletes_df = pd.DataFrame(get_full_response_list('http://sports.core.api.espn.com/v2/sports/football/leagues/nfl/athletes?lang=en&region=us&limit=1000'),
                           columns=['$ref_on_athletes'])

In [None]:
# Split athletes_df into `chunks_number` consecutive blocks of equal size, followed by
# one extra block holding the leftover rows (empty when the row count divides evenly).
chunks_number = 250
rows_per_chunk = athletes_df.shape[0] // chunks_number
chunks_list = [
    athletes_df.iloc[rows_per_chunk * i:rows_per_chunk * (i + 1), :].copy()
    for i in range(chunks_number)
]
chunks_list.append(athletes_df.iloc[rows_per_chunk * chunks_number:athletes_df.shape[0], :].copy())

In [None]:
def _nested_columns(chunk):
    """Return the names of the columns that still hold at least one dict or list cell."""
    return [name for name in chunk.columns
            if chunk[name].apply(lambda cell: isinstance(cell, (dict, list))).any()]


def _unwrap_until_flat(chunk):
    """Expand nested columns repeatedly until no dict or list cell is left; return the new frame."""
    nested = _nested_columns(chunk)
    while nested:
        chunk = unwrap_list_columns(chunk, nested)
        nested = _nested_columns(chunk)
    return chunk


def _resolve_team_refs(chunk):
    """Replace, in place, every ``$ref_from_team*`` column by a team-id and a season-start column."""
    team_columns = [name for name in chunk.columns if name.startswith('$ref_from_team')]
    for name in team_columns:
        resolved = chunk[name].apply(get_season_start_and_team_id)
        chunk['team_id_from_' + name] = [pair[0] for pair in resolved]
        chunk['season_start_from_' + name] = [pair[1] for pair in resolved]
    chunk.drop(columns=team_columns, inplace=True)


def _resolve_ref_columns(chunk, prefixes, target_prefix, resolver):
    """Replace, in place, every column starting with ``prefixes`` by ``resolver`` applied to its links."""
    link_columns = [name for name in chunk.columns if name.startswith(prefixes)]
    for name in link_columns:
        chunk[target_prefix + name] = chunk[name].apply(resolver)
    chunk.drop(columns=link_columns, inplace=True)


def completely_preprocess_data(chunk):
    """Turn a chunk of athlete links into a flat table of scalar columns.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Frame with a ``'$ref_on_athletes'`` column of athlete links. It is left
        untouched: all work happens on a copy, so the same chunk can be processed
        again (e.g. after a failed run).

    Returns
    -------
    pandas.DataFrame
        The flattened data, one row per athlete of the chunk.

    Notes
    -----
    Steps, in order: load the athlete documents and flatten them; resolve team
    links to (team id, season start); load the remaining links and flatten again;
    resolve team, athlete, season and parent-position links to plain ids/dates;
    load whatever links are left and flatten one last time.
    """
    # Work on a copy: the helpers below add and drop columns in place.
    chunk = chunk.copy()

    process_link_columns(chunk, ['$ref_on_athletes'])
    chunk = unwrap_list_columns(chunk, ['loaded_from_$ref_on_athletes'])
    chunk = _unwrap_until_flat(chunk)

    # Team links are reduced to ids before the generic loading step, which would
    # otherwise download each team document in full.
    _resolve_team_refs(chunk)

    process_link_columns(chunk, [name for name in chunk.columns if name.startswith('$ref')])
    chunk = _unwrap_until_flat(chunk)

    # Links uncovered by the second round are reduced to an id or a date.
    _resolve_team_refs(chunk)
    _resolve_ref_columns(chunk, ('$ref_from_athlete', '$ref_from_proAthlete'),
                         'athlete_id_from_', get_athlete_id)
    _resolve_ref_columns(chunk, '$ref_from_season', 'season_start_from_', get_season_start)
    _resolve_ref_columns(chunk, '$ref_from_parent', 'parent_position_id_from_', get_parent_position_id)

    process_link_columns(chunk, [name for name in chunk.columns if name.startswith('$ref')])
    return _unwrap_until_flat(chunk)

In [None]:
import multiprocessing
# Number of CPUs reported for this runtime (displayed as the cell output).
multiprocessing.cpu_count()

2

In [None]:
# Sanity check: (rows, columns) of one previously saved chunk file.
(pd.read_csv('/content/drive/MyDrive/BidData/data/ChunkProcessAttempt/Probe_210.csv')).shape

  (pd.read_csv('/content/drive/MyDrive/BidData/data/ChunkProcessAttempt/Probe_210.csv')).shape


(71, 35794)

In [None]:
def multy_proc_preprocess(chunk_num, output_dir='/content/drive/MyDrive/BidData/data/Athletes_Chunks_Only'):
    """Process one chunk of ``chunks_list`` and save the result as a CSV file.

    Intended as the target of a ``multiprocessing.Process``.

    Parameters
    ----------
    chunk_num : int
        Position of the chunk inside ``chunks_list``.
    output_dir : str, optional
        Directory receiving the file ``Probe_<chunk_num>.csv``. Defaults to the
        folder this notebook has always written to.
    """
    # Local import: this function is defined before the notebook imports os.
    import os

    res_df = completely_preprocess_data(chunks_list[chunk_num])
    print(f'Chunk {chunk_num} processed')
    res_df.to_csv(os.path.join(output_dir, f'Probe_{chunk_num}.csv'))
    print('Probe file uploaded')

In [None]:
# #13,
# for i in range(21, 30, 2):
#     chunks_num_list = [i, i+1]
#     processes = [multiprocessing.Process(target=multy_proc_preprocess, args=[num]) for num in chunks_num_list]

#     for process in processes:
#         process.start()

#     # wait for the processes to finish
#     for process in processes:
#         process.join()

In [None]:
import os
# List the files already present in the output folder and keep only their chunk
# numbers, e.g. 'Probe_45.csv' -> 45.
# NOTE(review): every directory entry is assumed to look like '<prefix>_<int>.<ext>';
# any other name makes the [1] index or the int() conversion raise — confirm the
# folder only ever holds such files.
content = os.listdir('/content/drive/MyDrive/BidData/data/Athletes_Chunks_Only')
content = [int(x.split('_')[1].split('.')[0]) for x in content]

In [None]:
# Chunk numbers in the range 11..80 that have no saved file yet.
# NOTE(review): the bounds 11 and 81 are hard-coded — confirm they match the range still to be processed.
upload_chunks = [x for x in range(11, 81) if x not in content]

In [None]:
# Half the number of pending chunks; the same expression is the upper bound of the
# processing loop in the next cell.
len(upload_chunks)//2

1

In [None]:
# Launch one worker process per chunk number in chunks_num_list and wait for all of
# them before the next iteration.
# NOTE(review): in its current state the loop ignores `i` and upload_chunks: every
# iteration processes the hard-coded chunk 45. The commented-out lines show the paired
# variant (two pending chunks per iteration) — confirm which one is wanted, and whether
# the range bound len(upload_chunks)//2 is right for it.
for i in range(0, len(upload_chunks)//2, 2):
    # chunks_num_list = [upload_chunks[i], upload_chunks[i+1]]
    # print(f'Process chunk {upload_chunks[i]} and chunk {upload_chunks[i+1]}')
    # processes = [multiprocessing.Process(target=multy_proc_preprocess, args=[num]) for num in chunks_num_list]

    chunks_num_list = [45]
    # print(f'Process chunk {upload_chunks[i]} and chunk {upload_chunks[i+1]}')
    print(f'Process chunk {45}')
    processes = [multiprocessing.Process(target=multy_proc_preprocess, args=[num]) for num in chunks_num_list]

    for process in processes:
        process.start()

    # wait for the processes to finish
    for process in processes:
        process.join()

Process chunk 45
Unwrapped successful
Unwrapped successful
Unwrapped successful
Unwrapped successful
Unwrapped successful
Unwrapped successful
Unwrapped successful
Unwrapped successful
Unwrapped successful
Unwrapped successful
Unwrapped successful
Unwrapped successful
Unwrapped successful
Unwrapped successful
Unwrapped successful
Unwrapped successful
Chunk 45 processed
Probe file uploaded


In [None]:
# Build (but do not start) one worker process per chunk number.
# NOTE(review): chunks_num_list is whatever the loop in the previous cell left behind;
# on a fresh kernel this cell fails unless that loop ran at least one iteration.
processes = [multiprocessing.Process(target=multy_proc_preprocess, args=[num]) for num in chunks_num_list]

In [None]:
# Start every prepared worker, then block until all of them have finished.
for process in processes:
    process.start()

# wait for the processes to finish
for process in processes:
    process.join()

In [None]:
# is_success = completely_preprocess_data(chunks_list[800])

In [None]:
# empty_pd = pd.DataFrame()

In [None]:
# empty_pd.to_csv('/content/drive/MyDrive/BidData/data/ChunkProcessAttempt/GeneralFile.csv')

In [None]:
# for i in range(3):
#     res_df = completely_preprocess_data(chunks_list[i])
#     print(f'Chunk {i} processed')
#     saved_df = pd.read_csv('/content/drive/MyDrive/BidData/data/ChunkProcessAttempt/GeneralFile.csv')
#     df_to_upload = pd.concat([saved_df, res_df], axis=0)
#     df_to_upload.to_csv('/content/drive/MyDrive/BidData/data/ChunkProcessAttempt/GeneralFile.csv')
#     print('General file updated')

In [None]:
# Reload the accumulated result file written by an earlier processing attempt.
saved_df = pd.read_csv('/content/drive/MyDrive/BidData/data/ChunkProcessAttempt/GeneralFile.csv')

  saved_df = pd.read_csv('/content/drive/MyDrive/BidData/data/ChunkProcessAttempt/GeneralFile.csv')


In [None]:
# (rows, columns) of the reloaded file.
saved_df.shape

(213, 54042)