# Scrape One Rikishi Data

In [2]:
import urllib2
import bs4 #this is beautiful soup
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import re

# custom helper functions
from data_extraction import extract_rank, extract_years, extract_birthplace, \
                            extract_body_specs, extract_stable, extract_record

In [3]:
# Profile page to scrape. The commented-out URLs are alternative pages kept
# around as test cases for the parsing further down.
rikishi = "http://sumodb.sumogames.de/Rikishi.aspx?r=1123" # hakuho
# rikishi = "http://sumodb.sumogames.de/Rikishi.aspx?r=11893" # Akashi (NaN height weight?)
# rikishi = "http://sumodb.sumogames.de/Rikishi.aspx?shikona=kisenosato"
# rikishi = "http://sumodb.sumogames.de/Rikishi.aspx?r=3565"
# rikishi = "http://sumodb.sumogames.de/Rikishi.aspx?r=9280" # Genkai (unknown intai)
# rikishi = "http://sumodb.sumogames.de/Rikishi.aspx?r=3067" # uzugafuchi (long career record test)

# Download the page. The response object is closed explicitly so the
# connection is released even if read() fails part-way through.
response = urllib2.urlopen(rikishi)
try:
    source = response.read()
finally:
    response.close()

soup = bs4.BeautifulSoup(source, 'lxml') # turn into soup
print('done')

done


## Parse HTML Table with Rikishi Data

In [4]:
# Pull the raw profile strings out of the rikishi data table.
# Two tables on the page carry the class 'rikishidata'; the second one is used.
tables = soup.findAll('table', {'class': 'rikishidata'})
rows = tables[1].findAll('tr')

# Row label (text of the first cell, exactly as it appears on the page)
# -> name of the raw field it fills. The labels are Japanese terms; the
# field names are English for convenience. Sub-record labels are indented on
# the page with &nbsp;, which shows up in the cell text as u'\xa0'.
ROW_LABELS = {
    u'Shikona': 'shikona',                        # sumo name
    u'Highest Rank': 'rank',
    u'Birth Date': 'birth_date',
    u'Shusshin': 'shusshin',                      # place of origin
    u'Height and Weight': 'ht_wt',
    u'Heya': 'heya',
    u'Hatsu Dohyo': 'hatsu_dohyo',                # career start date
    u'Intai': 'intai',                            # retirement date
    u'Career Record': 'career_record',
    u'\xa0 In Makuuchi': 'makuuchi_record',
    u'\xa0\xa0 As Yokozuna': 'yokozuna_record',
    u'\xa0\xa0 As Ozeki': 'ozeki_record',
    u'\xa0\xa0 As Sekiwake': 'sekiwake_record',
    u'\xa0\xa0 As Komusubi': 'komusubi_record',
    u'\xa0\xa0 As Maegashira': 'maegashira_record',
    u'\xa0 In Juryo': 'juryo_record',
    u'\xa0 In Makushita': 'makushita_record',
    u'\xa0 In Sandanme': 'sandanme_record',
    u'\xa0 In Jonidan': 'jonidan_record',
    u'\xa0 In Jonokuchi': 'jonokuchi_record',
}

# Every field starts as an empty unicode string, so a row that is absent from
# the page simply leaves its field blank.
raw_fields = dict.fromkeys(ROW_LABELS.values(), u'')

for row in rows: # iterate through rows
    cells = row.findAll('td')
    # Only two-cell rows are label/value pairs. Testing the length first also
    # keeps rows without any <td> cells from raising IndexError on cells[0].
    if len(cells) != 2:
        continue
    field = ROW_LABELS.get(cells[0].text)
    if field is not None:
        raw_fields[field] = cells[1].text

# Expose each raw string under its own name; later cells read these directly.
shikona = raw_fields['shikona']
rank = raw_fields['rank']
birth_date = raw_fields['birth_date']
shusshin = raw_fields['shusshin']
ht_wt = raw_fields['ht_wt']
heya = raw_fields['heya']
hatsu_dohyo = raw_fields['hatsu_dohyo']
intai = raw_fields['intai']

career_record = raw_fields['career_record']
makuuchi_record = raw_fields['makuuchi_record']
yokozuna_record = raw_fields['yokozuna_record']
ozeki_record = raw_fields['ozeki_record']
sekiwake_record = raw_fields['sekiwake_record']
komusubi_record = raw_fields['komusubi_record']
maegashira_record = raw_fields['maegashira_record']
juryo_record = raw_fields['juryo_record']
makushita_record = raw_fields['makushita_record']
sandanme_record = raw_fields['sandanme_record']
jonidan_record = raw_fields['jonidan_record']
jonokuchi_record = raw_fields['jonokuchi_record']

print("---Raw Extracted Strings---")
print("Shikona: " + shikona)
print("Highest Rank: " + rank)
print("Date of Birth: " + birth_date)
print("Shusshin: " + shusshin)
print("Ht/Wt: " + ht_wt) # height (cm), weight (kg)
print("Heya: " + heya)
print("Hatsu Dohyo: " + hatsu_dohyo)
print("Intai: " + intai)
print("Career Record: " + career_record) # wins, losses, withdrawals, appearances, tourneys
print("Makuuchi_Record: " + makuuchi_record) # same as career record, but + possible awards

---Raw Extracted Strings---
Shikona: Hakuho Sho
Highest Rank: Yokozuna (May 2007)
Date of Birth: March 11, 1985 (31 years)
Shusshin: Mongolia, Ulan-Bator
Ht/Wt: 192 cm 152.9 kg
Heya: Miyagino
Hatsu Dohyo: 2001.03
Intai: 
Career Record: 1019-215-48/1232 (95 basho)
Makuuchi_Record: 925-167-48/1090 (76 basho), 37 Yusho, 21 Jun-Yusho, 2 Gino-Sho, 3 Shukun-Sho, 1 Kanto-Sho, 1 Kinboshi


## Prepare Data For Storing

In [5]:
# Turn the raw strings scraped above into values ready for storage, using the
# helpers imported from data_extraction.
highest_rank = extract_rank(rank)
bday, age, active_years, debut, entry_rank, retirement = extract_years(birth_date, hatsu_dohyo, intai)
birth_place = extract_birthplace(shusshin)
height, weight = extract_body_specs(ht_wt)
sumo_stable = extract_stable(heya)

# One parsed record per division / rank. The printing below indexes these
# with keys such as 'wins' and 'yusho', so extract_record is expected to
# return a dict-like object -- NOTE(review): confirm against data_extraction.
career_record_dict = extract_record(career_record)
makuuchi_record_dict = extract_record(makuuchi_record)
yokozuna_record_dict = extract_record(yokozuna_record)
ozeki_record_dict = extract_record(ozeki_record)
sekiwake_record_dict = extract_record(sekiwake_record)
komusubi_record_dict = extract_record(komusubi_record)
maegashira_record_dict = extract_record(maegashira_record)
juryo_record_dict = extract_record(juryo_record)
makushita_record_dict = extract_record(makushita_record)
sandanme_record_dict = extract_record(sandanme_record)
jonidan_record_dict = extract_record(jonidan_record)
jonokuchi_record_dict = extract_record(jonokuchi_record)


def _print_values(pairs):
    """Print each (label, value) pair as label + str(value), one per line."""
    for label, value in pairs:
        print(label + str(value))


def _print_record(record, fields):
    """Print record[key] behind its label for each (label, key) in fields."""
    for label, key in fields:
        print(label + str(record[key]))


# (label, key) pairs shared by the career and makuuchi summaries.
BOUT_FIELDS = [
    ("Wins: ", 'wins'),
    ("Losses: ", 'losses'),
    ("Withdrawals: ", 'withdrawals'),
    ("Appearances: ", 'appearances'),
    ("Tourneys: ", 'tourneys'),
]
# Award counts, printed for the makuuchi record only.
AWARD_FIELDS = [
    ("Yusho: ", 'yusho'),
    ("Jun-Yusho: ", 'jun_yusho'),
    ("Gino-Sho: ", 'gino_sho'),
    ("Shukun-Sho: ", 'shukun_sho'),
    ("Kanto-Sho: ", 'kanto_sho'),
    ("Kinboshi: ", 'kinboshi'),
]

print("Shikona: " + shikona)
print('\n')

print("--Details--")
_print_values([
    ("Rank: ", highest_rank),
    ("Birthday: ", bday),
    ("Age: ", age),
    ("Birth Place: ", birth_place),
    ("Height: ", height),
    ("Weight: ", weight),
    ("Sumo Stable: ", sumo_stable),
    ("Active Years: ", active_years),
    ("Debut: ", debut),
    ("Entry Rank: ", entry_rank),
    ("Retirement: ", retirement),
])
print('\n')

print("--Career Record--")
_print_record(career_record_dict, BOUT_FIELDS)
print('\n')

print("--Makuuchi Record--")
_print_record(makuuchi_record_dict, BOUT_FIELDS + AWARD_FIELDS)
print('\n')

Shikona: Hakuho Sho


--Details--
Rank: Yokozuna
Birthday: 1985-03-11 00:00:00
Age: 31.9041752225
Birth Place: Mongolia, Ulan-Bator
Height: 192.0
Weight: 152.9
Sumo Stable: Miyagino
Active Years: 15.9315537303
Debut: 2001-03-01 00:00:00
Entry Rank: None
Retirement: None


--Career Record--
Wins: 1019
Losses: 215
Withdrawals: 48
Appearances: 1232
Tourneys: 95


--Makuuchi Recod--
Wins: 925
Losses: 167
Withdrawals: 48
Appearances: 1090
Tourneys: 76
Yusho: 37
Jun-Yusho: 21
Gino-Sho: 2
Shukun-Sho: 3
Kanto-Sho: 1
Kinboshi: 1




## Store in Pandas Dataframe

In [6]:
# Column name / value pairs for the single rikishi scraped above. Keeping each
# name next to its value makes it hard for the two lists to drift out of step.
stat_pairs = [
    (u'shikona', shikona),
    (u'highest_rank', highest_rank),
    (u'bday', bday),
    (u'age', age),
    (u'birth_place', birth_place),
    (u'height', height),
    (u'weight', weight),
    (u'sumo_stable', sumo_stable),
    (u'active_years', active_years),
    (u'debut', debut),
    (u'entry_rank', entry_rank),
    (u'retirement', retirement),
    (u'career_record', career_record_dict),
    (u'makuuchi_record', makuuchi_record_dict),
    (u'yokozuna_record', yokozuna_record_dict),
    (u'ozeki_record', ozeki_record_dict),
    (u'sekiwake_record', sekiwake_record_dict),
    (u'komusubi_record', komusubi_record_dict),
    (u'maegashira_record', maegashira_record_dict),
    (u'juryo_record', juryo_record_dict),
    (u'makushita_record', makushita_record_dict),
    (u'sandanme_record', sandanme_record_dict),
    (u'jonidan_record', jonidan_record_dict),
    (u'jonokuchi_record', jonokuchi_record_dict),
]
# Split the pairs back into the two parallel lists: names and values.
cols, compiled_stats = map(list, zip(*stat_pairs))

# Start from an empty frame with the desired columns, then append the
# compiled stats as the next row (row label = current number of rows).
rikishi_df = pd.DataFrame(columns=cols)
next_row = len(rikishi_df.index)
rikishi_df.loc[next_row] = compiled_stats

rikishi_df.head()




Unnamed: 0,shikona,highest_rank,bday,age,birth_place,height,weight,sumo_stable,active_years,debut,entry_rank,retirement,career_record,makuuchi_record,yokozuna_record,ozeki_record,sekiwake_record,komusubi_record,maegashira_record,juryo_record,makushita_record,sandanme_record,jonidan_record,jonokuchi_record
0,Hakuho Sho,Yokozuna,1985-03-11,31.904175,"Mongolia, Ulan-Bator",192.0,152.9,Miyagino,15.931554,2001-03-01,,,"{u'appearances': 1232, u'jun_yusho': None, u'w...","{u'appearances': 1090, u'jun_yusho': 21, u'wit...","{u'appearances': 827, u'jun_yusho': 16, u'with...","{u'appearances': 90, u'jun_yusho': 1, u'withdr...","{u'appearances': 68, u'jun_yusho': 2, u'withdr...","{u'appearances': 30, u'jun_yusho': 1, u'withdr...","{u'appearances': 75, u'jun_yusho': 1, u'withdr...","{u'appearances': 30, u'jun_yusho': None, u'wit...","{u'appearances': 35, u'jun_yusho': None, u'wit...","{u'appearances': 42, u'jun_yusho': None, u'wit...","{u'appearances': 21, u'jun_yusho': None, u'wit...","{u'appearances': 14, u'jun_yusho': None, u'wit..."


In [30]:
# Inspect the record stored in the 'jonokuchi_record' column of row 0,
# using a single row/column lookup.
rikishi_df.loc[0, 'jonokuchi_record']

{u'appearances': 14,
 u'gino_sho': None,
 u'jun_yusho': None,
 u'kanto_sho': None,
 u'kinboshi': None,
 u'losses': 6,
 u'shukun_sho': None,
 u'tourneys': 2,
 u'wins': 8,
 u'withdrawals': None,
 u'yusho': None}

# Scratch Work

In [47]:
# Scratch: prototype of the award parsing applied to a raw record string.
# (`re` is already imported in the first cell of the notebook.)
record = u'925-167-48/1090 (76 basho), 37 Yusho, 21 Jun-Yusho, 2 Gino-Sho, 3 Shukun-Sho, 1 Kanto-Sho, 1 Kinboshi'

# Groups, in order: the first three are the dash-separated counts (third
# optional), the fourth is an optional extra '-...' suffix, the fifth is the
# number after '/', the sixth is the basho count, and the seventh is the
# optional trailing award text. Raw strings keep the backslash escapes intact.
reg_exp = re.compile(r'^(\d+)-(\d+)(?:-)?(\d+)?(-[a-zA-Z0-9-]+)?/(\d+)\s\((\d+)\sbasho\)(?:,\s)?(.+)?$', re.UNICODE)
result = reg_exp.match(record)
if result is None:
    # Fail with a readable message instead of an AttributeError on None.
    raise ValueError("unrecognised record format: %r" % (record,))
stats = result.groups()

# The award group is None when the record lists no awards at all
# (e.g. '1019-215-48/1232 (95 basho)'), so fall back to an empty list.
parse_list = stats[6].split(', ') if stats[6] else []
print(parse_list)
print(len(parse_list))

# Default to None so both names exist even when an award is missing.
yusho = None
jun_yusho = None
for award in parse_list:
    # 'Yusho' is a substring of 'Jun-Yusho', so test the longer name first.
    if 'Jun-Yusho' in award:
        find_list = re.findall(r'\d+', award)
        print(find_list)
        jun_yusho = int(find_list[0])
        print(jun_yusho)
    elif 'Yusho' in award:
        find_list = re.findall(r'\d+', award)
        print(find_list)
        yusho = int(find_list[0])
        print(yusho)

[u'37 Yusho', u'21 Jun-Yusho', u'2 Gino-Sho', u'3 Shukun-Sho', u'1 Kanto-Sho', u'1 Kinboshi']
6
[u'37']
37
[u'21']
21
