In [6]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd
import time
import csv

# Objective: scrape Paul Morphy's chess games from chess.com, split into his games as White and his games as Black.
# The scraped data will then be cleaned and feature-engineered, and used for visualizations and light exploratory analysis.

In [7]:
# Example search URLs to paste at the prompt:
# 'https://www.chess.com/games/search?p1=paul-morphy&sort=3' # white games
# 'https://www.chess.com/games/search?p1=paul-morphy&sort=4' # black games
#
# The answer is stripped because a pasted URL often carries stray whitespace,
# and scrape() appends '&page=N' directly to this string.
URL = input('Type in URL:').strip()
print('Here we go!')

Type in URL:https://www.chess.com/games/search?p1=paul-morphy&sort=4
Here we go!


In [8]:
# Get the players names and their ratings if available
def get_players(game):
    """Return the text of the username/rating spans of a game row, space-separated.

    Parameters
    ----------
    game : element exposing ``find_all`` (a BeautifulSoup tag for one game row).

    Returns
    -------
    str
        The text of every ``<span>`` carrying the ``master-games-username`` or
        ``master-games-user-rating`` class, in the order ``find_all`` returns
        them, joined by single spaces. An empty string if no span matches.

    Notes
    -----
    Any number of spans is accepted, so a row with a missing rating (or with
    no rating at all) no longer raises on tuple unpacking.
    """
    users = game.find_all('span', class_=['master-games-username', 'master-games-user-rating'])
    parts = []
    for user in users:
        text = user.string
        if text is None:
            # .string is None when the span does not hold exactly one text node;
            # fall back to the span's full text instead of failing on None.
            text = user.get_text()
        parts.append(text)
    return " ".join(parts)

# Get the opening moves and the opening name (including the variation, if available)
def get_opening(game):
    """Return the opening link's ``title`` followed by its displayed name.

    The link is the ``<a>`` with classes
    ``master-games-content-stats master-games-opening``. Its ``title``
    attribute supplies the first part of the result; the second part is the
    text of the link's fourth child node with leading whitespace removed.
    The two parts are separated by a single space.
    """
    link = game.find('a', class_='master-games-content-stats master-games-opening')
    name_node = link.contents[3]
    pieces = (link['title'], name_node.string.lstrip())
    return " ".join(pieces)

# Get the result of the game
def get_result(game):
    """Return the stripped text of the first "clickable, middle-aligned" link.

    That link is the first ``<a>`` in the row with classes
    ``master-games-clickable-link master-games-text-middle``; scrape() stores
    its text in the 'Result' column.
    """
    result_link = game.find('a', class_= 'master-games-clickable-link master-games-text-middle')
    raw_text = result_link.string
    return raw_text.strip()

# Get the number of moves of the game
def get_num_moves(game):
    """Return the stripped text of the second "clickable, middle-aligned" link.

    The row's ``<a>`` elements with classes
    ``master-games-clickable-link master-games-text-middle`` are collected and
    the one at index 1 is used (index 0 is the one get_result() reads).
    """
    links = game.find_all('a', class_= 'master-games-clickable-link master-games-text-middle')
    moves_link = links[1]
    return moves_link.text.strip()

# Get the year the game was played
def get_year(game):
    """Return the ``title`` attribute of the row's date link, unmodified.

    The date link is the ``<a>`` with classes
    ``master-games-date master-games-clickable-link master-games-text-middle``.
    scrape() stores the returned value in the 'Year' column.
    """
    date_link = game.find('a', class_="master-games-date master-games-clickable-link master-games-text-middle")
    return date_link['title']

# CSV column names, in the same order as the values scrape() places in each row
header = [
    'Players',
    'Opening',
    'Result',
    'Moves',
    'Year',
]
# Scraped game rows; starts empty and is reassigned from scrape()'s return value
# before the CSV is written.
rows = list()

# Scrape all the games from every page into a list of lists
def scrape(url, delay=5, timeout=30):
    """Scrape every result page of a games search and return one row per game.

    Parameters
    ----------
    url : str
        Search URL. The page number is appended as a ``page`` query parameter.
    delay : int or float, optional
        Seconds to wait between consecutive page requests (default 5).
    timeout : int or float, optional
        Per-request timeout in seconds passed to ``requests.get`` (default 30).

    Returns
    -------
    list of list
        One ``[players, opening, result, moves, year]`` list per game row,
        in page order. A fresh list is built on every call, so calling the
        function again (or re-running the cell) does not duplicate games.

    Raises
    ------
    requests.HTTPError
        If any page responds with an HTTP error status.
    ValueError
        If the first page has no element with id ``master-games-container``,
        which is where the total page count is read from.
    """
    scraped_rows = []

    # The first request is only used to learn how many pages there are.
    first_page = requests.get(url, timeout=timeout)
    first_page.raise_for_status()
    soup = BeautifulSoup(first_page.content, 'html.parser')
    container = soup.find(id="master-games-container")
    if container is None:
        raise ValueError("Could not find 'master-games-container' on page: " + url)
    max_pages = int(container['data-total-pages'])

    # Join with '&' when the URL already has a query string, otherwise start one.
    separator = '&' if '?' in url else '?'

    for current_page in range(1, max_pages + 1):
        current_url = url + separator + 'page=' + str(current_page)
        html = requests.get(current_url, timeout=timeout)
        html.raise_for_status()
        soup = BeautifulSoup(html.content, 'html.parser')
        games = soup.find_all("tr", class_="master-games-master-game v-board-popover")
        for game in games:
            scraped_rows.append([
                get_players(game),
                get_opening(game),
                get_result(game),
                get_num_moves(game),
                get_year(game),
            ])
        # Pause between requests, but not after the final page.
        if current_page < max_pages:
            time.sleep(delay)
    return scraped_rows

rows = scrape(URL)
name_of_csv = input('Name your csv file! White or black?')

# Normalise the answer so that "Black", " black ", "blackgames" and
# "blackgames.csv" all produce the same file name ("blackgames.csv")
# instead of e.g. "blackgamesgames.csv".
csv_prefix = name_of_csv.strip().lower()
for suffix in ('.csv', 'games'):
    if csv_prefix.endswith(suffix):
        csv_prefix = csv_prefix[:-len(suffix)].rstrip()

# newline='' lets the csv module control line endings itself.
with open(csv_prefix + 'games.csv', 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(header)
    csvwriter.writerows(rows)


Name your csv file:blackgames
