In [None]:
import pandas as pd
import requests
import time
import numpy as np
import json
import concurrent.futures
import jsonlines


# Lift pandas' display limits on both axes: None means "no cap" for columns and rows.
pd.set_option('display.max_columns', None, 'display.max_rows', None)


In [None]:
# Load the challenger match-history table from CSV (path is relative to the notebook).
challenger_match_hists = pd.read_csv('../data/challenger_match_hists.csv')

In [None]:
# Maps the short region code used throughout this notebook to the Riot API host
# for that region. Every host is "<platform id>.api.riotgames.com".
region_base_url_dict = {
    region_code: f'{platform_id}.api.riotgames.com'
    for region_code, platform_id in (
        ('na', 'na1'),
        ('br', 'br1'),
        ('eun', 'eun1'),
        ('euw', 'euw1'),
        ('jp', 'jp1'),
        ('kr', 'kr'),
        ('la1', 'la1'),
        ('la2', 'la2'),
        ('oce', 'oc1'),
        ('tr', 'tr1'),
        ('ru', 'ru'),
    )
}
            

In [None]:
# Riot API key: the entry at label 0 of the 'riot_api_key' column of ../secrets.json.
# NOTE(review): this only parses if the JSON values are list-like or nested -- confirm the file layout.
api_key = pd.read_json('../secrets.json')['riot_api_key'][0]

# Endpoint paths. The request helpers in this notebook build URLs as
# https://<regional host><path><id>?api_key=<key>.

# Match timeline, addressed by match id.
timeline_by_match_id_url = '/lol/match/v4/timelines/by-match/'

# Match details, addressed by match id.
match_by_match_id_url = '/lol/match/v4/matches/'

# Summoner lookup by summoner name.
account_by_name_url = '/lol/summoner/v4/summoners/by-name/'

# Match list, addressed by account id.
match_hist_by_id_url = '/lol/match/v4/matchlists/by-account/'

# Challenger league for the RANKED_SOLO_5x5 queue (takes no id suffix).
challenger_ladder_url = '/lol/league/v4/challengerleagues/by-queue/RANKED_SOLO_5x5'

# Summoner lookup by summoner id.
summoner_by_summoner_id = '/lol/summoner/v4/summoners/'

In [None]:
# Champion metadata from the Data Dragon CDN, pinned to patch 10.16.1.
# Fetched over HTTPS with a timeout so a stalled connection cannot hang the kernel;
# raise_for_status() surfaces an HTTP error directly instead of a JSON decoding failure.
_champions_response = requests.get(
    'https://ddragon.leagueoflegends.com/cdn/10.16.1/data/en_US/champion.json',
    timeout=30,
)
_champions_response.raise_for_status()
champions = _champions_response.json()

In [None]:
# Build the frame from champions['data'] and flip it so the outer keys of that
# mapping become the row index (presumably one row per champion -- confirm against the payload).
champions_df = pd.DataFrame(champions['data']).T

In [None]:
#challenger_ladder = requests.get(f'https://na1.api.riotgames.com{challenger_ladder_url}?api_key={api_key}')

In [None]:
#challenger_ladder.json()

In [None]:
def get_all_challengers():
    """Download the challenger ladder of every region in ``region_base_url_dict``.

    One request is made per region, with a pause of about 1.2 seconds after each
    one. Regions whose request does not return HTTP 200, or whose payload has no
    usable ``'entries'`` list, are reported with a printed message and skipped.

    Returns:
        pandas.DataFrame: the ladder entries of all successful regions stacked
        together, with an added ``'region'`` column holding the region code.
        Each region keeps its own 0..n-1 index, so callers usually reset it.
        An empty DataFrame is returned when no region succeeded.
    """
    region_frames = []
    for key, base_url in region_base_url_dict.items():
        ladder_response = requests.get(f'https://{base_url}{challenger_ladder_url}?api_key={api_key}')

        response_df = None
        if ladder_response.status_code == 200:
            try:
                response_df = pd.DataFrame(ladder_response.json()['entries'])
            except (KeyError, TypeError, ValueError):
                # A 200 with an unexpected body is treated like any other bad response.
                response_df = None

        if response_df is None:
            print(f'Bad request for {key}: {ladder_response.status_code}')
        else:
            response_df['region'] = key
            region_frames.append(response_df)

        time.sleep(1.2001)

    # pd.concat raises on an empty list, so keep the "nothing fetched" result explicit.
    if not region_frames:
        return pd.DataFrame()
    return pd.concat(region_frames)

In [None]:
def get_summoner(base_url, summoner_id):
    """Request the summoner record for ``summoner_id`` from the host ``base_url``.

    Returns the raw response object from ``requests.get``; the caller decides how
    to handle the status code and body.
    """
    endpoint = f'https://{base_url}{summoner_by_summoner_id}{summoner_id}'
    url = f'{endpoint}?api_key={api_key}'
    return requests.get(url)

In [None]:
def get_match_hist(account_id, base_url, queue = '420'):
    """Request the match list of ``account_id`` from the host ``base_url``.

    ``queue`` is sent as the ``queue`` query parameter and defaults to ``'420'``.
    Returns the raw response object from ``requests.get``.
    """
    endpoint = f'https://{base_url}{match_hist_by_id_url}{account_id}'
    url = f'{endpoint}?api_key={api_key}&queue={queue}'
    return requests.get(url)

In [None]:
def get_match(match_id, base_url):
    """Request a match and its timeline from the host ``base_url``.

    Both requests are always sent, the match first and the timeline second.
    Returns a ``(match_response, timeline_response)`` tuple of raw responses.
    """
    host = f'https://{base_url}'
    key_suffix = f'?api_key={api_key}'
    match_response = requests.get(f'{host}{match_by_match_id_url}{match_id}{key_suffix}')
    timeline_response = requests.get(f'{host}{timeline_by_match_id_url}{match_id}{key_suffix}')
    return match_response, timeline_response

In [None]:
def get_challenger_match_hists(region, challengers_df=None, has_account_id=False):
    """Download the match list of every challenger in one region.

    Args:
        region: region code; must be a key of ``region_base_url_dict`` and a value
            of the ``'region'`` column of ``challengers_df``.
        challengers_df: ladder frame with ``'summonerId'`` and ``'region'`` columns.
            When omitted, the module-level ``all_challengers`` frame is looked up
            at call time, so this function can be defined before that frame exists.
            The frame passed in is never modified.
        has_account_id: when True, account ids are taken from an ``'account_ids'``
            column of ``challengers_df`` instead of being requested one by one.

    Returns:
        pandas.DataFrame: the ``'matches'`` entries of every successful request
        stacked together, with ``'region'`` and ``'account_id'`` columns added.
        Empty when nothing could be downloaded.

    Raises:
        KeyError: if ``has_account_id`` is True but the column is missing.

    Account ids are only kept in this function's working copy; they are not
    written back to ``challengers_df`` or ``all_challengers``.
    """
    if challengers_df is None:
        challengers_df = all_challengers

    wanted_columns = ['summonerId', 'region']
    if has_account_id:
        if 'account_ids' not in challengers_df.columns:
            raise KeyError("has_account_id=True requires an 'account_ids' column")
        wanted_columns.append('account_ids')

    # Work on an independent copy so the column added below never touches the caller's frame.
    regional_df = (
        challengers_df.loc[challengers_df['region'] == region, wanted_columns]
        .copy()
        .reset_index(drop=True)
    )
    base_url = region_base_url_dict[region]

    if not has_account_id:
        account_ids = []
        for i, summoner_id in enumerate(regional_df['summonerId']):
            if i % 500 == 0:
                print(f'{i} of {len(regional_df)} account ids')
            summoner = get_summoner(base_url, summoner_id).json()
            account_id = summoner.get('accountId')
            if account_id is None:
                # Error payloads carry no accountId; skip this summoner instead of aborting the region.
                print(f'No account id for summoner {summoner_id} ({region}): {summoner}')
            account_ids.append(account_id)
            time.sleep(1.2)
        regional_df['account_ids'] = account_ids

    region_histories = []
    for i, account_id in enumerate(regional_df['account_ids']):
        if i % 500 == 0:
            print(f'{i} of {len(regional_df)} match histories')
        if pd.isna(account_id):
            continue
        try:
            match_hist = pd.DataFrame(get_match_hist(account_id, base_url).json()['matches'])
        except Exception as err:
            # Best effort: one failed account must not stop the region, but say so.
            print(f'Skipping match history of {account_id} ({region}): {err!r}')
        else:
            match_hist['region'] = region
            match_hist['account_id'] = account_id
            region_histories.append(match_hist)
        time.sleep(1.2)

    # pd.concat raises on an empty list, so return an empty frame explicitly.
    if not region_histories:
        return pd.DataFrame()
    return pd.concat(region_histories)

In [None]:
def _iter_jsonl(path):
    """Yield each JSON line stored in the file at ``path``."""
    with jsonlines.open(path) as infile:
        yield from infile.iter()


def _dump_jsonl(path, entries, mode):
    """Write every entry to ``path`` as one JSON document per line, opening the file with ``mode``."""
    with open(path, mode) as outfile:
        for entry in entries:
            json.dump(entry, outfile)
            outfile.write('\n')


def _save_progress(region, matches, timelines, scraped_ids, unscraped_ids, participants):
    """Append buffered matches/timelines for ``region`` and rewrite both id files.

    ``matches`` and ``timelines`` are emptied once they are on disk, and
    ``unscraped_ids`` is updated in place to every known participant that is not
    in ``scraped_ids``.
    """
    _dump_jsonl(f'../data/matches_{region}.jsonl', matches, 'a')
    matches.clear()
    _dump_jsonl(f'../data/timelines_{region}.jsonl', timelines, 'a')
    timelines.clear()

    # The id files hold complete sets, so they are rewritten rather than appended to.
    _dump_jsonl(f'../data/scraped_ids_{region}.jsonl', scraped_ids, 'w')
    unscraped_ids.update(participants)
    unscraped_ids.difference_update(scraped_ids)
    _dump_jsonl(f'../data/unscraped_ids_{region}.jsonl', unscraped_ids, 'w')


def scrape_seeds(region):
    """Download match and timeline data for the seed games of one region.

    Seed rows come from ``../data/challenger_match_hists_ranked_only.csv``,
    filtered to ``region``. A row is fetched when its account id is not yet in the
    scraped-id file or its game id is not yet in the matches file. Results are
    appended to ``../data/matches_<region>.jsonl`` and
    ``../data/timelines_<region>.jsonl``. The scraped/unscraped id files are
    rewritten at every checkpoint and once more at the end.

    A checkpoint is attempted on every row index divisible by 100, once at least
    one request has been made. If the most recent match request returned HTTP 403,
    progress is saved and the function returns early.

    Args:
        region: region code; must be a key of ``region_base_url_dict``.

    Returns:
        None.
    """
    print(f'{region} thread initialized')
    base_url = region_base_url_dict[region]

    print(f'reading scraped_ids_{region}')
    scraped_ids = set(_iter_jsonl(f'../data/scraped_ids_{region}.jsonl'))

    print(f'reading unscraped_ids_{region}')
    unscraped_ids = set(_iter_jsonl(f'../data/unscraped_ids_{region}.jsonl'))

    print(f'reading matches_{region}')
    # A set keeps the per-row "already downloaded?" check cheap.
    scraped_matches = {dict(line)['gameId'] for line in _iter_jsonl(f'../data/matches_{region}.jsonl')}

    seed = pd.read_csv('../data/challenger_match_hists_ranked_only.csv')
    seed = seed[seed['region'] == region].reset_index(drop=True)

    matches_list = []
    timelines_list = []
    scraped_participants = set()
    match = None  # most recent match response; None until the first request is made

    for i, (account_id, game_id) in enumerate(zip(seed['account_id'], seed['gameId'])):
        if account_id not in scraped_ids or game_id not in scraped_matches:
            scraped_ids.add(account_id)
            match, timeline = get_match(game_id, base_url)

            if match.status_code == 200 and timeline.status_code == 200:
                match_data = match.json()
                matches_list.append(match_data)
                timelines_list.append(timeline.json())
                scraped_participants.update(
                    part['player']['accountId'] for part in match_data['participantIdentities']
                )
                scraped_matches.add(game_id)
            else:
                print(f'matches error: {match.status_code}\ntimelines error: {timeline.status_code}\nSummoner: {account_id}\nRegion:{region}')
            time.sleep(2.4)

        if match is not None and (i % 100 == 0 or match.status_code == 403):
            print(f'{i} matches scraped of {len(seed)}\nRegion: {region}\n')
            try:
                _save_progress(region, matches_list, timelines_list,
                               scraped_ids, unscraped_ids, scraped_participants)
            except Exception as err:
                # Best effort: keep scraping, but make the failed checkpoint visible.
                print(f'Could not save progress for {region}: {err!r}')

            if match.status_code == 403:
                return None

    _save_progress(region, matches_list, timelines_list,
                   scraped_ids, unscraped_ids, scraped_participants)
    print(f'Seeds scraped, region = {region}')
    

In [None]:
def remove_duplicates(region):
    """Write the de-duplicated ranked matches of ``region`` to ``../data/cleaned``.

    Reads ``../data/matches_<region>.jsonl`` and ``../data/timelines_<region>.jsonl``.
    A match is kept when its ``queueId`` is 420 and its ``gameId`` has not been
    seen earlier in the file. A timeline is kept when the match at the same line
    position was kept, so the two input files are assumed to be aligned line by
    line. The results overwrite ``../data/cleaned/matches_<region>.jsonl`` and
    ``../data/cleaned/timelines_<region>.jsonl``.
    """
    with jsonlines.open(f'../data/matches_{region}.jsonl') as infile:
        match_list = list(infile.iter())

    with jsonlines.open(f'../data/timelines_{region}.jsonl') as infile:
        timeline_list = list(infile.iter())

    ranked_matches = []
    kept_positions = set()  # a set, so the timeline filter below is a cheap lookup
    game_ids = set()

    for i, match in enumerate(match_list):
        if match['queueId'] == 420 and match['gameId'] not in game_ids:
            ranked_matches.append(match)
            kept_positions.add(i)
            game_ids.add(match['gameId'])

    ranked_timelines = [
        timeline for i, timeline in enumerate(timeline_list) if i in kept_positions
    ]

    with open(f'../data/cleaned/matches_{region}.jsonl', 'w') as outfile:
        for entry in ranked_matches:
            json.dump(entry, outfile)
            outfile.write('\n')

    with open(f'../data/cleaned/timelines_{region}.jsonl', 'w') as outfile:
        for entry in ranked_timelines:
            json.dump(entry, outfile)
            outfile.write('\n')

In [None]:
def remove_short_games(region, min_duration_minutes=15, min_champ_level=7):
    """Drop short or under-levelled games from the cleaned files of ``region``.

    Reads ``../data/cleaned/matches_<region>.jsonl`` and
    ``../data/cleaned/timelines_<region>.jsonl`` and overwrites both with the
    games that pass every check below. A timeline is kept when the match at the
    same line position was kept, so the two files are assumed to be aligned.

    A match is kept when:
      * ``gameDuration / 60`` is greater than ``min_duration_minutes``,
      * its ``gameId`` has not been seen earlier in the file, and
      * no participant has ``stats.champLevel`` below ``min_champ_level``.

    Args:
        region: region code used in the file names.
        min_duration_minutes: exclusive lower bound on ``gameDuration / 60``.
        min_champ_level: lowest ``champLevel`` a participant may have.
            NOTE(review): the earlier comment spoke of "level 6" while the code
            tested ``< 7``; the code's threshold is kept as the default -- confirm.
    """
    matches_path = f'../data/cleaned/matches_{region}.jsonl'
    timelines_path = f'../data/cleaned/timelines_{region}.jsonl'

    # Keep the full records: the filters below need more than the game id.
    with jsonlines.open(matches_path) as infile:
        match_list = list(infile.iter())

    with jsonlines.open(timelines_path) as infile:
        timeline_list = list(infile.iter())

    good_matches = []
    valid = set()
    game_ids = set()

    for i, match in enumerate(match_list):
        if match['gameDuration'] / 60 <= min_duration_minutes:
            continue
        if match['gameId'] in game_ids:
            continue
        if any(participant['stats']['champLevel'] < min_champ_level
               for participant in match['participants']):
            continue
        good_matches.append(match)
        valid.add(i)
        game_ids.add(match['gameId'])

    good_timelines = [timeline for i, timeline in enumerate(timeline_list) if i in valid]

    with open(matches_path, 'w') as outfile:
        for entry in good_matches:
            json.dump(entry, outfile)
            outfile.write('\n')

    with open(timelines_path, 'w') as outfile:
        for entry in good_timelines:
            json.dump(entry, outfile)
            outfile.write('\n')

In [None]:
#  import threading

#  for region in region_base_url_dict.keys():
#      some_thread = threading.Thread(target = scrape_seeds, args=[region])
#      some_thread.start()

In [None]:
# Sanity check for the NA region: for each seed row, print whether scrape_seeds
# would still fetch it (game not downloaded yet, or account not marked as scraped).
scraped_matches = []
scraped_ids = set()

with jsonlines.open(f'../data/scraped_ids_na.jsonl') as infile:
        print(f'reading scraped_ids_na')
        for line in infile.iter():
            scraped_ids.add(line)

with jsonlines.open(f'../data/matches_na.jsonl') as infile:
            print(f'reading matches_na')
            for line in infile.iter():
                scraped_matches.append(dict(line)['gameId'])

na_hists = challenger_match_hists[challenger_match_hists['region'] == 'na']

# Iterate over the rows themselves: the filtered frame keeps its original index
# labels, so counting positions with .loc can skip or miss rows.
scraped_match_lookup = set(scraped_matches)
for game_id, account_id in zip(na_hists['gameId'], na_hists['account_id']):
    print(game_id not in scraped_match_lookup or account_id not in scraped_ids)

In [None]:
# Scrape every region in parallel, one worker thread per region.
# Each future is checked explicitly so that an exception raised inside a worker
# is reported here instead of being silently discarded.
with concurrent.futures.ThreadPoolExecutor(max_workers = len(region_base_url_dict)) as executor:
    scrape_futures = {
        executor.submit(scrape_seeds, region_code): region_code
        for region_code in region_base_url_dict
    }
    for finished in concurrent.futures.as_completed(scrape_futures):
        scrape_error = finished.exception()
        if scrape_error is not None:
            print(f'scrape_seeds failed for {scrape_futures[finished]}: {scrape_error!r}')

In [None]:
# Fetch the challenger ladder of every region in region_base_url_dict (one request per region).
all_challengers = get_all_challengers()

In [None]:
# Renumber the rows 0..n-1 in place and discard the old index.
all_challengers.reset_index(drop=True,inplace=True)

In [None]:
# Spot check: request the summoner record for the ladder row labelled 0.
Spawwwwn = get_summoner(region_base_url_dict[all_challengers['region'][0]], all_challengers['summonerId'][0])

In [None]:
# Show the decoded JSON body of that response.
Spawwwwn.json()

In [None]:
# Preview the first rows of the ladder frame.
all_challengers.head()

In [None]:
# Summoner id and region of the row labelled 0.
all_challengers.loc[0,['summonerId','region']]

In [None]:
# Spot check: request that summoner's match list (get_match_hist defaults to queue '420').
test = get_match_hist(Spawwwwn.json()['accountId'],region_base_url_dict[all_challengers['region'][0]])

In [None]:
# Top-level keys of the decoded match-list body.
test.json().keys()

In [None]:
#pd.DataFrame(test.json()['matches'])

In [None]:
%%time
#challenger_match_hists = get_challenger_match_hists(all_challengers)

# Download the match histories with 11 worker threads, one call per region code.
# executor.map yields results in the order of the region codes; extend() appends
# each frame as it is yielded.
match_hist_list = []
with concurrent.futures.ThreadPoolExecutor(max_workers = 11) as executor:
    match_hist_list.extend(executor.map(get_challenger_match_hists, region_base_url_dict.keys()))

In [None]:
# Stack the per-region frames into a single frame.
challenger_match_hists = pd.concat(match_hist_list)

In [None]:
# Renumber the rows 0..n-1 in place and discard the old index.
challenger_match_hists.reset_index(drop=True,inplace=True)

In [None]:
# Save without the index column; scrape_seeds reads this file as its list of seed games.
challenger_match_hists.to_csv('../data/challenger_match_hists_ranked_only.csv', index=False)

In [None]:
# Write the de-duplicated ranked match/timeline files for every region.
for region in region_base_url_dict:
    remove_duplicates(region)

In [None]:
# Filter the cleaned match/timeline files of every region.
for region in region_base_url_dict:
    remove_short_games(region)