In [63]:
import os
import pandas as pd 
from tqdm import tqdm
from os.path import join
# Make the repository root the working directory so that the relative
# 'downloads' and 'data' paths built later in the notebook resolve correctly.
# os.path.basename handles both '\\' and '/' separators; splitting on '\\'
# never matched on POSIX and moved one directory up on every run of the cell.
# NOTE(review): assumes the notebook lives exactly one level below 'irl-chess'.
if os.path.basename(os.getcwd()) != 'irl-chess':
    os.chdir(os.pardir)

In [85]:
import requests
import pyzstd

def decompress_zstd(zstd_path, extract_path):
    """Decompress the Zstandard archive ``zstd_path`` into ``extract_path``.

    The archive is decoded in 1 MiB chunks rather than being read and
    decompressed in one piece, so memory use no longer grows with the size of
    the archive. Output goes to ``<extract_path>.part`` first and is renamed
    only once the whole archive decoded, so a failure never leaves a truncated
    file at ``extract_path``.

    Args:
        zstd_path: Path of the compressed file to read.
        extract_path: Path the decompressed bytes are written to.

    Returns:
        True on success, False on any failure. Errors are printed, not raised,
        matching ``download_file``.
    """
    partial_path = f"{extract_path}.part"
    try:
        # The source is opened first, so a missing archive fails before any
        # output file is created.
        with open(zstd_path, 'rb') as zstd_file, \
                pyzstd.ZstdFile(zstd_file) as decompressed_stream, \
                open(partial_path, 'wb') as decompressed_file:
            while True:
                chunk = decompressed_stream.read(1024 * 1024)
                if not chunk:
                    break
                decompressed_file.write(chunk)

        # Publish the result only after every chunk was written.
        os.replace(partial_path, extract_path)
        print(f"Decompressed: {zstd_path}")
        return True
    except Exception as e:
        print(f"Failed to decompress {zstd_path}: {e}")
        # Drop the incomplete temporary file, if one was started.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        return False

def download_file(url, destination, timeout=60, chunk_size=1024 * 1024):
    """Download ``url`` to the local path ``destination``.

    The response body is streamed to disk in chunks instead of being held in
    memory. HTTP error statuses (4xx/5xx) are treated as failures, so an error
    page is never saved as if it were the requested file. Data is written to
    ``<destination>.part`` and renamed on success, so an existing file at
    ``destination`` is only replaced by a complete download.

    Args:
        url: Address of the file to fetch.
        destination: Local path the file is saved to.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
        chunk_size: Number of bytes requested per streamed chunk.

    Returns:
        True if the file was downloaded completely, False otherwise. Errors
        are printed, not raised.
    """
    partial_path = f"{destination}.part"
    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            # requests does not raise on 4xx/5xx by itself.
            response.raise_for_status()
            with open(partial_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    file.write(chunk)

        os.replace(partial_path, destination)
        print(f"Downloaded: {url}")
        return True
    except Exception as e:
        print(f"Failed to download {url}: {e}")
        # Remove the incomplete temporary file, if one was started.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        return False

def parse_moves(moves):
    """Turn a space-separated move string into a comma-terminated token string.

    Every token that contains a '.' (such as a move number like ``1.``) is
    dropped; each remaining token is emitted followed by a comma, so a
    non-empty result always ends with a trailing comma.

    Args:
        moves: Text whose tokens are separated by single spaces.

    Returns:
        The kept tokens concatenated, each one followed by ','.
    """
    kept_tokens = [token for token in moves.split(' ') if '.' not in token]
    return ''.join(f'{token},' for token in kept_tokens)


def txt_to_csv(filename, columns=None):
    """Convert a decompressed PGN text file into a CSV with one row per game.

    Header lines (those starting with '[') are stored under their own tag
    name, taken as the text between '[' and the first space, with the value
    being the last double-quoted string on the line. Any other non-blank line
    is treated as the game's movetext: it is passed through ``parse_moves``,
    stored under 'Moves', and closes the current game.

    Keying values by tag name keeps every value in its own column even when a
    game carries extra or missing headers, so no padding column is needed.
    Rows are collected in a list local to this call, so games from earlier
    conversions are not written into later CSV files.

    Args:
        filename: Path of the text file to read. The output path is this path
            without its extension, with 'raw' replaced by 'processed' and
            '.csv' appended.
        columns: Column names, in order, to write. Defaults to the sixteen
            names used by this notebook. Tags not listed are dropped; listed
            tags missing from a game are left empty.

    Returns:
        The path of the CSV file that was written.
    """
    if columns is None:
        columns = ['Event', 'White', 'Site', 'Black', 'Result', 'UTCDate', 'UTCTime',
                   'WhiteElo', 'BlackElo', 'WhiteRatingDiff', 'BlackRatingDiff',
                   'ECO', 'Opening', 'TimeControl', 'Termination', 'Moves']

    games = []
    game = {}
    # NOTE(review): assumes the dump is UTF-8 text - confirm. Undecodable bytes
    # are replaced rather than aborting the whole conversion.
    with open(filename, 'r', encoding='utf-8', errors='replace') as f:
        for line in tqdm(f.readlines(), 'Converting to csv'):
            stripped = line.strip()
            if not stripped:
                continue
            if stripped.startswith('['):
                tag_name = stripped[1:].split(' ', 1)[0]
                pieces = stripped.split('"')
                # Guard against a header without a quoted value.
                game[tag_name] = pieces[-2] if len(pieces) >= 3 else ''
            else:
                # Movetext line: also covers games whose movetext does not
                # start with '1' (e.g. only a result such as '0-1').
                game['Moves'] = parse_moves(stripped)
                games.append(game)
                game = {}

    root, _ = os.path.splitext(filename)
    filename_out = root.replace('raw', 'processed') + '.csv'
    out_dir = os.path.dirname(filename_out)
    if out_dir:
        # The 'processed' directory may not exist yet.
        os.makedirs(out_dir, exist_ok=True)

    df = pd.DataFrame(games, columns=list(columns))
    df.to_csv(filename_out, index=False)
    return filename_out
    
    
def download_lichess_pgn(websites_list, file_path_data, max_files=None):
    """Download, decompress and convert every archive listed in a URL file.

    For each URL the archive is saved under ``file_path_data``, decompressed
    to the same path minus its last four characters (the '.zst' suffix), and
    the result is passed to ``txt_to_csv``. The archive is deleted and the
    conversion run only if decompression actually produced the output file;
    otherwise the archive is kept so the step can be retried.

    Args:
        websites_list: Path of a text file with one URL per line. Blank lines
            are ignored.
        file_path_data: Directory the downloads are stored in; created if it
            does not exist.
        max_files: Stop after this many URLs. When None, the module-level
            ``n_files`` variable is used instead.

    Returns:
        None. Progress and errors are printed rather than raised.
    """
    try:
        with open(websites_list, 'r') as filename:
            urls = [line.strip() for line in filename if line.strip()]
    except FileNotFoundError:
        # Scoped to the URL list only, so a missing data file later on is not
        # misreported as a missing list.
        print(f"File not found: {websites_list}")
        return

    try:
        os.makedirs(file_path_data, exist_ok=True)

        for i, url in enumerate(urls, start=1):
            destination = join(file_path_data, url.split("/")[-1])
            print(f'\n\n-------------------  {i}/{len(urls)}  -------------------\n\n')
            if download_file(url, destination):
                filepath_out = destination[:-4]
                # Remove any stale output so its presence afterwards proves
                # that this decompression run succeeded.
                if os.path.isfile(filepath_out):
                    os.remove(filepath_out)
                decompress_zstd(destination, extract_path=filepath_out)
                if os.path.isfile(filepath_out):
                    os.remove(destination)
                    print(f'Unzipped and deleted the zip file!')
                    txt_to_csv(filepath_out)
                    print(f'Converted the .txt to .csv!')
                else:
                    print(f"Decompression produced no file, keeping archive: {destination}")

            # The limit is resolved here, as before, so an undefined global is
            # only reported after the first URL has been processed.
            if i == (n_files if max_files is None else max_files):
                break
    except Exception as e:
        print(f"An error occurred: {e}")


In [86]:
# Number of URLs from the list to process; download_lichess_pgn reads this
# module-level name to decide when to stop.
n_files = 2

# Both locations are resolved against the current working directory.
websites_list = join(
    os.getcwd(),
    'downloads',
    'lichess_websites.txt',
)
file_path_data = join(
    os.getcwd(),
    'data',
    'raw',
)

download_lichess_pgn(websites_list, file_path_data)



-------------------  1/2  -------------------

Downloaded: https://database.lichess.org/standard/lichess_db_standard_rated_2013-01.pgn.zst
Decompressed: C:\Users\toell\OneDrive\Documents\GitHub\irl-chess\data\raw\lichess_db_standard_rated_2013-01.pgn.zst
Unzipped and deleted the zip file!


Converting to csv: 100%|██████████| 2183562/2183562 [00:03<00:00, 624028.70it/s]


Converted the .txt to .csv!


-------------------  2/2  -------------------

Downloaded: https://database.lichess.org/standard/lichess_db_standard_rated_2013-02.pgn.zst
Decompressed: C:\Users\toell\OneDrive\Documents\GitHub\irl-chess\data\raw\lichess_db_standard_rated_2013-02.pgn.zst
Unzipped and deleted the zip file!


Converting to csv: 100%|██████████| 2230831/2230831 [00:03<00:00, 593550.94it/s]


Converted the .txt to .csv!


In [76]:
# Display the resolved path of the URL list as a quick sanity check.
websites_list

'C:\\Users\\toell\\OneDrive\\Documents\\GitHub\\irl-chess\\downloads\\lichess_websites.txt'

In [69]:
# Column names for the generated CSV files, in output order.
columns = [
    'Event', 'White', 'Site', 'Black', 'Result',
    'UTCDate', 'UTCTime',
    'WhiteElo', 'BlackElo', 'WhiteRatingDiff', 'BlackRatingDiff',
    'ECO', 'Opening', 'TimeControl', 'Termination',
    'Moves',
]

# Start from an empty module-level list of parsed games.
data_raw = []

# One already-decompressed monthly dump per (year, month) pair.
files = [
    join(file_path_data, f'lichess_db_standard_rated_{year}-{month}.pgn')
    for year, month in [('2013', '01')]
]

for filename in files:
    txt_to_csv(filename)


100%|██████████| 2183562/2183562 [00:04<00:00, 507611.82it/s]
