In [None]:
import pandas as pd
import requests as r
from bs4 import BeautifulSoup
from pprint import pprint

class StrategyError(Exception):
    """Signals that a scraping step could not locate the page content it expects."""

# Load the fighter table. The file is decoded as ISO-8859-1, and
# low_memory=False has pandas parse each column in one pass instead of in
# chunks, which avoids mixed-dtype inference within a column.
fighters = pd.read_csv(
    'fighter_sot.csv',
    encoding='iso-8859-1',
    low_memory=False,
)

In [None]:
# Row-wise boolean masks over `fighters`:
#   not_missing -> the 'type' column equals 'e'
#   not_found   -> the 'fighter_name' column holds no value
not_missing = fighters['type'].eq('e')
not_found = fighters['fighter_name'].isna()

In [None]:
# Keep only the rows that satisfy both masks: type 'e' and no fighter name.
only_errors = fighters.loc[not_found & not_missing]

In [None]:
# Hand-picked ids, merged into a single list.
# NOTE(review): presumably fighter ids expected to resolve (`correct`) and
# expected not to (`incorrect`) - confirm; no visible cell consumes `sample`.
correct = [1, 2, 3]
incorrect = [128663, 128669, 128769]
sample = [*correct, *incorrect]

In [None]:
def find_name(soup):
    """Return the fighter name shown in the page heading.

    The name is the first child of the ``<span>`` nested inside the
    ``<h1 itemprop="name">`` element.

    Args:
        soup: Parsed page exposing BeautifulSoup's ``find`` API.

    Returns:
        The first entry of the heading span's ``contents``.

    Raises:
        AttributeError: If the heading, or the span inside it, is absent.
            Callers rely on this to detect pages that have no fighter.
    """
    # Only the heading is needed; no other lookup is performed on the page.
    heading = soup.find('h1', {'itemprop': 'name'})
    return heading.span.contents[0]


def find_pro_fights(soup, fighter_id):
    """Collect the fights listed in the page's 'Fight History - Pro' section.

    Every ``<div class="module fight_history">`` is inspected; the first one
    whose header reads exactly ``'Fight History - Pro'`` is parsed and
    returned. Sections without a ``module_header`` div are skipped.

    Args:
        soup: Parsed fighter page exposing BeautifulSoup's ``find_all`` API.
        fighter_id: Id of the fighter the page belongs to; copied into every
            returned record and into the error message.

    Returns:
        A list with one tuple per table row after the first, shaped
        ``(fighter_id, opponent_id, result, event_date, event_link)``.
        ``opponent_id`` is the integer after the last ``-`` of the opponent
        link, or ``0`` when that text is not an integer. The list is empty
        when the section has no rows beyond the first.

    Raises:
        StrategyError: If no section titled 'Fight History - Pro' exists.
    """
    for section in soup.find_all('div', {"class": "module fight_history"}):
        header = section.find('div', {"class": "module_header"})
        if header is None:
            # Without a header the section cannot be identified as the pro
            # history, so move on instead of failing on a None lookup.
            continue
        if header.contents[1].text != 'Fight History - Pro':
            continue

        fights = []
        # The first <tr> is skipped (presumably the column-title row).
        for row in section.find_all('tr')[1:]:
            # Only the first three cells are used; any others are ignored.
            result_cell, fighter_cell, event_cell, *_ = row.find_all('td')
            result = result_cell.span.contents[0]
            opponent_url = fighter_cell.contents[0]['href']
            try:
                opponent_id = int(opponent_url[opponent_url.rfind("-") + 1:])
            except ValueError:
                # opponent is 'Unknown opponent'
                opponent_id = 0
            event_link = event_cell.contents[0]['href']
            # The date is read from the last <span> of the event cell.
            event_date = event_cell.find_all('span')[-1].contents[0]
            fights.append(
                (fighter_id, opponent_id, result, event_date, event_link)
            )
        return fights

    raise StrategyError(
        'didnt find any pro fights for fighter id {}'.format(fighter_id)
    )
    
    
def parse_fighter_page(page_soup, fighter_id):
    """Extract the fighter name and pro fight list from a parsed page.

    Args:
        page_soup: Parsed fighter page, passed on to ``find_name`` and
            ``find_pro_fights``.
        fighter_id: Id of the fighter the page was requested for.

    Returns:
        A dict that always has the keys ``'fighter_name'``, ``'type'`` and
        ``'matches'``:

        * name found and pro fights found: ``type == 'f'``, ``matches`` holds
          the result of ``find_pro_fights``;
        * name found but ``find_pro_fights`` raised ``StrategyError``:
          ``type == 'a'``, ``matches == []``;
        * ``find_name`` raised ``AttributeError``:
          ``fighter_name == 'no_fighter'``, ``type == 'm'``, ``matches == []``.
    """
    fighter_info = {}
    try:
        fighter_info['fighter_name'] = find_name(page_soup)
        fighter_info['type'] = 'f'
    except AttributeError:
        fighter_info['fighter_name'] = 'no_fighter'
        fighter_info['type'] = 'm'
        # Keep the result shape uniform: consumers read 'matches' on every
        # result, so a page without a fighter reports an empty list.
        fighter_info['matches'] = []
        return fighter_info
    try:
        fighter_info['matches'] = find_pro_fights(page_soup, fighter_id)
    except StrategyError:
        print('Could not find pro fights!')
        fighter_info['type'] = 'a'
        fighter_info['matches'] = []
    return fighter_info

def verify(row):
    """Re-fetch one fighter's Sherdog page and refresh the record from it.

    Args:
        row: Mapping-like record with an ``'id'`` entry and a ``copy()``
            method (for example a DataFrame row passed in by
            ``DataFrame.apply(..., axis=1)``).

    Returns:
        A copy of ``row`` whose ``'fighter_name'``, ``'type'`` and
        ``'matches'`` entries are set from the parsed page. ``row`` itself is
        left untouched.

    Raises:
        Whatever ``requests`` raises for a failed request, including a
        timeout after 3 seconds; such errors are not handled here.
    """
    fighter_id = row['id']
    url = "http://www.sherdog.com/fighter/index?id={}".format(fighter_id)
    page = r.get(url, timeout=3).text
    page_soup = BeautifulSoup(page, 'html.parser')

    fighter_info = parse_fighter_page(page_soup, fighter_id)

    # Work on a copy: pandas does not support mutating the object it hands to
    # an apply() callback.
    updated = row.copy()
    updated['fighter_name'] = fighter_info['fighter_name']
    updated['type'] = fighter_info['type']
    # Tolerate a parse result that carries no 'matches' entry.
    updated['matches'] = fighter_info.get('matches', [])
    print(updated['id'], updated['fighter_name'])
    return updated
    

In [None]:
# Run `verify` on every row (axis='columns' hands each row to the function),
# which performs one HTTP request per row. The name `only_errors` is rebound
# to the result, so re-running this cell operates on already-verified rows.
only_errors = only_errors.apply(verify, axis='columns')

In [None]:
# Show the rows whose id is at least 135185.
# NOTE(review): the cutoff is not explained anywhere visible - confirm intent.
only_errors.loc[only_errors['id'].ge(135185)]