Using: https://github.com/bnsreenu/python_for_microscopists/blob/master/311_fine_tuning_GPT2.ipynb

Note: ```--break-system-packages``` is only required for my customised WSL2 Ubuntu 22.04

In [1]:
!pip install -q transformers torch datasets python-docx --break-system-packages

In [2]:
!pip install -qU PyPDF2 --break-system-packages

In [3]:
import pandas as pd
import numpy as np
import re
from PyPDF2 import PdfReader
import os
import docx

In [4]:
# Functions to read different file types
def read_pdf(file_path):
    """Return the extracted text of every page of a PDF, in page order.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The result of ``extract_text()`` for each page, joined with no
        separator between pages.
    """
    with open(file_path, "rb") as pdf_stream:
        reader = PdfReader(pdf_stream)
        # Extract while the underlying stream is still open.
        page_texts = [page.extract_text() for page in reader.pages]
    return "".join(page_texts)

def read_word(file_path):
    """Return the paragraph text of a Word document.

    Args:
        file_path: Path of the ``.docx`` file to read.

    Returns:
        The text of each entry in the document's ``paragraphs``, each
        followed by a newline.
    """
    document = docx.Document(file_path)
    return "".join(para.text + "\n" for para in document.paragraphs)

def read_txt(file_path):
    """Return the full contents of a UTF-8 encoded text file.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The decoded contents of the file as a single string.

    Raises:
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, so the result is the same on every operating system.
    with open(file_path, "r", encoding="utf-8") as file:
        return file.read()

def read_documents_from_directory(directory):
    """Concatenate the text of every supported document in a directory.

    Entries whose names end in ``.pdf``, ``.docx`` or ``.txt`` are read with
    ``read_pdf``, ``read_word`` and ``read_txt`` respectively. Every other
    entry is ignored. Only the directory's immediate entries are listed.

    Args:
        directory: Path of the directory to scan.

    Returns:
        The text of all supported files, joined with no separator, in
        sorted filename order. An empty string if nothing matched.
    """
    parts = []
    # os.listdir() returns names in arbitrary order; sorting keeps the
    # combined text reproducible from one run (or machine) to the next.
    for filename in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, filename)
        if filename.endswith(".pdf"):
            parts.append(read_pdf(file_path))
        elif filename.endswith(".docx"):
            parts.append(read_word(file_path))
        elif filename.endswith(".txt"):
            parts.append(read_txt(file_path))
    # Join once at the end instead of growing a string inside the loop.
    return "".join(parts)

Downloaded the ```godot-docs``` HTML archive from ```https://docs.godotengine.org/en/stable/index.html```, using the ```stable``` version. As of today it is ```319 MB```.

Extract the archive's contents into ```godot-docs-html-stable```, then run the cell below to extract the text from the HTML files (the formatting is rough, which is acceptable for v1).

In [10]:
%%time
# Recursively convert HTML files into plain-text files (formatting is not preserved)
import os
from bs4 import BeautifulSoup

def convert_html_to_txt(html_file_path, txt_file_path):
    """Write the text content of one HTML file to a text file.

    Args:
        html_file_path: Path of the HTML file to read (decoded as UTF-8).
        txt_file_path: Path of the text file to write (encoded as UTF-8).
            An existing file at this path is overwritten.
    """
    with open(html_file_path, 'r', encoding='utf-8') as source:
        # get_text() returns the document's text with the markup removed.
        plain_text = BeautifulSoup(source, 'html.parser').get_text()

    with open(txt_file_path, 'w', encoding='utf-8') as sink:
        sink.write(plain_text)

def process_folder(input_folder, output_folder):
    """Convert every ``.html`` file under ``input_folder`` into a text file.

    The input tree is walked recursively, but the results are written flat
    into ``output_folder``. Each output file is named
    ``<parent directory name>-<file stem>.txt``. For files that sit directly
    in ``input_folder``, the prefix is the name of ``output_folder`` itself.

    NOTE: two HTML files with the same name and the same parent directory
    name, located in different subtrees, map to the same output file; the
    one processed later overwrites the earlier one.

    Args:
        input_folder: Root directory to search for ``.html`` files.
        output_folder: Directory that receives the ``.txt`` files. It is
            created (including missing parents) if it does not exist.
    """
    os.makedirs(output_folder, exist_ok=True)

    for root, _, files in os.walk(input_folder):
        for file in files:
            if not file.endswith('.html'):
                print(' - skipped:', file)
                continue

            print('processing:', file)
            html_file_path = os.path.join(root, file)
            relative_path = os.path.relpath(html_file_path, input_folder)
            txt_file_path = os.path.join(
                output_folder, os.path.splitext(relative_path)[0] + '.txt'
            )
            # Use os.path helpers rather than splitting on '/', so the
            # flattened name is built correctly whatever the platform's
            # path separator is.
            parent_name = os.path.basename(os.path.dirname(txt_file_path))
            flat_name = parent_name + '-' + os.path.basename(txt_file_path)
            convert_html_to_txt(
                html_file_path, os.path.join(output_folder, flat_name)
            )

# Root of the HTML tree to convert: the "tutorials" folder inside the
# extracted docs directory.
input_folder = 'godot-docs-html-stable/tutorials'
# Directory that receives the generated .txt files.
output_folder = 'godot-docs-text'

# Convert every .html file found under input_folder.
process_folder(input_folder, output_folder)

processing: troubleshooting.html
processing: 2d_antialiasing.html
processing: 2d_lights_and_shadows.html
processing: 2d_meshes.html
processing: 2d_movement.html
processing: 2d_parallax.html
processing: 2d_sprite_animation.html
processing: 2d_transforms.html
processing: canvas_layers.html
processing: custom_drawing_in_2d.html
processing: index.html
processing: particle_systems_2d.html
processing: using_tilemaps.html
processing: using_tilesets.html
processing: 3d_antialiasing.html
processing: 3d_rendering_limitations.html
processing: 3d_text.html
processing: csg_tools.html
processing: environment_and_post_processing.html
processing: high_dynamic_range.html
processing: index.html
processing: introduction_to_3d.html
processing: lights_and_shadows.html
processing: mesh_lod.html
processing: occlusion_culling.html
processing: physical_light_and_camera_units.html
processing: resolution_scaling.html
processing: standard_material_3d.html
processing: using_decals.html
processing: using_gridmaps.h

processing: compositor.html
processing: index.html
processing: jitter_stutter.html
processing: multiple_resolutions.html
processing: viewports.html
processing: change_scenes_manually.html
processing: creating_script_templates.html
processing: cross_language_scripting.html
processing: evaluating_expressions.html
processing: filesystem.html
processing: groups.html
processing: how_to_read_the_godot_api.html
processing: idle_and_physics_processing.html
processing: index.html
processing: instancing_with_signals.html
processing: nodes_and_scene_instances.html
processing: overridable_functions.html
processing: pausing_games.html
processing: resources.html
processing: scene_tree.html
processing: scene_unique_nodes.html
processing: singletons_autoload.html
processing: c_sharp_basics.html
processing: c_sharp_collections.html
processing: c_sharp_differences.html
processing: c_sharp_exports.html
processing: c_sharp_features.html
processing: c_sharp_global_classes.html
processing: c_sharp_signals.h