# Scrape All Rikishi Data

In [1]:
import urllib2
import bs4 #this is beautiful soup
import pandas as pd
# Relax pandas' display limits so the wide (24-column) rikishi frame can be
# shown in full in cell output, and keep the HTML table repr enabled.
for option_name, option_value in (
        ('display.width', 500),
        ('display.max_columns', 100),
        ('display.notebook_repr_html', True)):
    pd.set_option(option_name, option_value)
import seaborn as sns #sets up styles and gives us more plotting options
import time

# custom helper functions
from rikishi_scrape import rikishi_scrape

In [2]:
# PARENT URL CONTAINING TABLE OF RIKISHI
#
# Exactly one `db_url` assignment below should be active. The commented-out
# alternatives are kept so the query can be switched to a smaller subset
# (useful for quick test runs) by moving the comment marker.

# all yokozuna
# db_url = "http://sumodb.sumogames.de/Rikishi.aspx?shikona=&heya=-1&shusshin=-1&b=-1&high=1&hd=-1&entry=-1&intai=-1&sort=4"

# three current yokozuna 
# db_url = "http://sumodb.sumogames.de/Rikishi.aspx?shikona=&heya=-1&shusshin=-1&b=201701&high=1&hd=-1&entry=-1&intai=-1&sort=4"

# all makushita
# db_url = "http://sumodb.sumogames.de/Rikishi.aspx?shikona=&heya=-1&shusshin=-1&b=-1&high=7&hd=-1&entry=-1&intai=-1&sort=4"

# all rikishi
db_url = "http://sumodb.sumogames.de/Rikishi.aspx?shikona=&heya=-1&shusshin=-1&b=-1&high=-1&hd=-1&entry=-1&intai=-1&sort=7"

# Download the listing page. The response is closed explicitly instead of
# relying on garbage collection; urllib2 responses are not context managers
# in Python 2, hence try/finally rather than a `with` block.
response = urllib2.urlopen(db_url)
try:
    source = response.read()
finally:
    response.close()

soup = bs4.BeautifulSoup(source, 'lxml') # turn into soup

# simple marker that the download and parse finished
print('done')

done


## Obtain URL for Each Row's Rikishi and Scrape Their Data

In [3]:
# Locate the listing table: its rows are the <tr> elements inside the first
# <td class="layoutright"> cell of the parsed page.
right_td = soup.findAll('td', {'class': 'layoutright'})
if not right_td:
    # fail with a clear message instead of an opaque IndexError below
    raise ValueError("no <td class='layoutright'> found; the page layout may have changed")
rows = right_td[0].findAll('tr') # take first td with layoutright class

# Column names of the resulting frame. Each rikishi_scrape() result is stored
# as one row under these names (presumably one value per column, in this
# order -- verify against rikishi_scrape).
cols = [u'shikona', u'highest_rank', u'bday', u'age', u'birth_place', u'height', u'weight', u'sumo_stable',
        u'active_years', u'debut', u'entry_rank', u'retirement', u'career_record', u'makuuchi_record',
        u'yokozuna_record', u'ozeki_record', u'sekiwake_record', u'komusubi_record', u'maegashira_record',
        u'juryo_record', u'makushita_record', u'sandanme_record', u'jonidan_record', u'jonokuchi_record']

# Results are collected in a plain list and turned into a DataFrame once;
# growing a DataFrame one row at a time copies it on every iteration.
scraped_rows = []

start = time.time() # wall-clock timer for the whole scrape

try:
    # rows[0] is the header row, so start at the second row while keeping the
    # printed iteration number equal to the row's position in the table
    for ind, row in enumerate(rows[1:], 1):
        links = row.findAll('a', href=True)
        if not links:
            # a row without any link has no rikishi page to follow; skip it
            # instead of crashing on links[0]
            continue
        # the first link's href is relative to the site root
        rikishi_url = "http://sumodb.sumogames.de/" + links[0]['href']
        compiled_stats = rikishi_scrape(rikishi_url)
        scraped_rows.append(compiled_stats)
        # progress line: iteration number and the first scraped value (the shikona)
        print("Iter: %d, " % ind + compiled_stats[0])
finally:
    # Build the frame even if the loop was interrupted or a page failed, so
    # whatever was scraped so far stays available as `rikishi_df`
    # (the raw results also remain in `scraped_rows`).
    rikishi_df = pd.DataFrame(scraped_rows, columns=cols)

end = time.time()
exec_time = end - start
print("SCRAPING DONE")
print("Execution Time: %.2f (s)" % exec_time)

# NOTE(review): the active query in this notebook is labelled "all rikishi",
# yet the file is named 'all_yokozuna.pkl'. The name is kept because other
# code may read this path -- confirm and rename if nothing depends on it.
rikishi_df.to_pickle('all_yokozuna.pkl')

Iter: 1, Akashi Shiganosuke
Iter: 2, Ayagawa Goroji
Iter: 3, Maruyama Gondazaemon
Iter: 4, Nishinoumi Kajiro
Iter: 5, Konishiki Yasokichi
Iter: 6, Ozutsu Manemon
Iter: 7, Onishiyama - Hitachiyama Taniemon
Iter: 8, Tachiyama Mineemon - Tachiyama Totaro - Tachiyama Mineemon
Iter: 9, Umenotani Otomatsu - Umegatani Totaro - Umegatani Mineemon - Umegatani Totaro
Iter: 10, Okido Moriemon
Iter: 11, Tanegashima# - Hoshikabuto - Nishikinada Yosaburo - Nishinoumi Nadaemon - Nishinoumi Kajiro
Iter: 12, Onishiki Daigoro
Iter: 13, Otori - Otori Tanigoro
Iter: 14, Genjiyama Isesuke - Genjiyama Daigoro - Nishinoumi Kajiro
Iter: 15, Onishiki Uichiro
Iter: 16, Tochigiyama Senjo - Tochigiyama Moriya
Iter: 17, Iwategawa - Miyagiyama - Miyagiyama Fukumatsu
Iter: 18, Tsunenohana Kanichi
Iter: 19, Minanogawa Kyojiro - Asashio Kyojiro - Minanogawa Tozo
Iter: 20, Tamanishiki Sanemon
Iter: 21, Musashiyama Takeshi
Iter: 22, Futabayama Sadakichi - Futabayama Sadaji - Futabayama Sadabee - Futabayama Sadaji
Iter: 

## Validate Data Scraping

In [5]:
# Preview the first rows of the scraped frame as a quick sanity check of the
# columns; this expression is what produces the cell's displayed table.
rikishi_df.head()

# Optional check (disabled): show rows whose 'active_years' exceeds 30
# rikishi_df[rikishi_df['active_years'] > 30.]

# Optional (disabled): save a copy of the frame under a test filename
# rikishi_df.to_pickle('yokozuna_test4.pkl')


Unnamed: 0,shikona,highest_rank,bday,age,birth_place,height,weight,sumo_stable,active_years,debut,entry_rank,retirement,career_record,makuuchi_record,yokozuna_record,ozeki_record,sekiwake_record,komusubi_record,maegashira_record,juryo_record,makushita_record,sandanme_record,jonidan_record,jonokuchi_record
0,Akashi Shiganosuke,Yokozuna,,,"Tochigi-ken, Utsunomiya-shi",,,,,,,,"{u'appearances': 0, u'jun_yusho': None, u'with...","{u'appearances': None, u'jun_yusho': None, u'w...","{u'appearances': None, u'jun_yusho': None, u'w...","{u'appearances': None, u'jun_yusho': None, u'w...","{u'appearances': None, u'jun_yusho': None, u'w...","{u'appearances': None, u'jun_yusho': None, u'w...","{u'appearances': None, u'jun_yusho': None, u'w...","{u'appearances': None, u'jun_yusho': None, u'w...","{u'appearances': None, u'jun_yusho': None, u'w...","{u'appearances': None, u'jun_yusho': None, u'w...","{u'appearances': None, u'jun_yusho': None, u'w...","{u'appearances': None, u'jun_yusho': None, u'w..."
1,Ayagawa Goroji,Yokozuna,,,Tochigi-ken,200.0,150.0,,,,,,"{u'appearances': 0, u'jun_yusho': None, u'with...","{u'appearances': None, u'jun_yusho': None, u'w...","{u'appearances': None, u'jun_yusho': None, u'w...","{u'appearances': None, u'jun_yusho': None, u'w...","{u'appearances': None, u'jun_yusho': None, u'w...","{u'appearances': None, u'jun_yusho': None, u'w...","{u'appearances': None, u'jun_yusho': None, u'w...","{u'appearances': None, u'jun_yusho': None, u'w...","{u'appearances': None, u'jun_yusho': None, u'w...","{u'appearances': None, u'jun_yusho': None, u'w...","{u'appearances': None, u'jun_yusho': None, u'w...","{u'appearances': None, u'jun_yusho': None, u'w..."
2,Maruyama Gondazaemon,Yokozuna,1713-12-23 00:00:00,,Miyagi-ken,197.0,166.0,Nanatsumori,,,,,"{u'appearances': 0, u'jun_yusho': None, u'with...","{u'appearances': None, u'jun_yusho': None, u'w...","{u'appearances': None, u'jun_yusho': None, u'w...","{u'appearances': None, u'jun_yusho': None, u'w...","{u'appearances': None, u'jun_yusho': None, u'w...","{u'appearances': None, u'jun_yusho': None, u'w...","{u'appearances': None, u'jun_yusho': None, u'w...","{u'appearances': None, u'jun_yusho': None, u'w...","{u'appearances': None, u'jun_yusho': None, u'w...","{u'appearances': None, u'jun_yusho': None, u'w...","{u'appearances': None, u'jun_yusho': None, u'w...","{u'appearances': None, u'jun_yusho': None, u'w..."
3,Nishinoumi Kajiro,Yokozuna,1855-01-03 00:00:00,40.9938,"Kagoshima-ken, Sendai-shi",176.0,127.0,Takasago,13.9986,1882-01-01 00:00:00,Maegashira,1896-01-01 00:00:00,"{u'appearances': 193, u'jun_yusho': None, u'wi...","{u'appearances': 193, u'jun_yusho': None, u'wi...","{u'appearances': 60, u'jun_yusho': None, u'wit...","{u'appearances': 26, u'jun_yusho': None, u'wit...","{u'appearances': 60, u'jun_yusho': None, u'wit...","{u'appearances': 23, u'jun_yusho': None, u'wit...","{u'appearances': 24, u'jun_yusho': None, u'wit...","{u'appearances': None, u'jun_yusho': None, u'w...","{u'appearances': None, u'jun_yusho': None, u'w...","{u'appearances': None, u'jun_yusho': None, u'w...","{u'appearances': None, u'jun_yusho': None, u'w...","{u'appearances': None, u'jun_yusho': None, u'w..."
4,Konishiki Yasokichi,Yokozuna,1866-10-15 00:00:00,34.2122,"Chiba-ken, Sanbu-gun",168.0,130.0,Takasago,17.6701,1883-05-01 00:00:00,Jonokuchi,1901-01-01 00:00:00,"{u'appearances': 169, u'jun_yusho': None, u'wi...","{u'appearances': 159, u'jun_yusho': None, u'wi...","{u'appearances': 63, u'jun_yusho': None, u'wit...","{u'appearances': 61, u'jun_yusho': None, u'wit...","{u'appearances': None, u'jun_yusho': None, u'w...","{u'appearances': 17, u'jun_yusho': None, u'wit...","{u'appearances': 18, u'jun_yusho': None, u'wit...","{u'appearances': 10, u'jun_yusho': None, u'wit...","{u'appearances': None, u'jun_yusho': None, u'w...","{u'appearances': None, u'jun_yusho': None, u'w...","{u'appearances': None, u'jun_yusho': None, u'w...","{u'appearances': None, u'jun_yusho': None, u'w..."


In [6]:
# Spot-check one scraped row and one of its per-division record cells.
# NOTE(review): 56 is a hard-coded row label; which rikishi it selects depends
# on the query and row order used when the frame was built -- confirm before
# relying on it.
yokozuna = rikishi_df.loc[56]
# Each *_record cell holds a dict of counters (e.g. appearances, wins, losses,
# tourneys), with None where no value was scraped.
yokozuna['sandanme_record']

{u'appearances': 42,
 u'gino_sho': None,
 u'jun_yusho': None,
 u'kanto_sho': None,
 u'kinboshi': None,
 u'losses': 16,
 u'shukun_sho': None,
 u'tourneys': 6,
 u'wins': 26,
 u'withdrawals': None,
 u'yusho': None}