In [1]:
"""
Import Packages
"""

import json
from langchain_text_splitters import MarkdownHeaderTextSplitter
import os 
import pandas as pd 

In [2]:
"""
Set working environment
"""
# Make the project data folder the current directory so the relative
# file names used by the following cells resolve inside it.
data_dir = 'C:\\Users\\Administrator\\Desktop\\Projects\\VCT-Hackathon\\DataV2'
os.chdir(data_dir)


In [3]:
"""
Load the MD file 
"""
# Decode the markdown explicitly as UTF-8. Without an `encoding` argument
# open() uses the platform's locale encoding (commonly cp1252 on Windows),
# which mis-decodes multi-byte characters such as curly quotes and dashes
# into sequences like "â€™".
# NOTE(review): assumes valorant-text-data.md is saved as UTF-8 — confirm.
with open('valorant-text-data.md', 'r', encoding='utf-8') as md_file:
    md_text = md_file.read()

In [4]:
"""
Chunk the MD file 
"""
# (heading marker, label) pairs for heading levels 1 through 3
headers_to_split_on = [("#" * level, f"Header {level}") for level in (1, 2, 3)]

# Build the splitter; strip_headers=False is passed so each heading line
# stays inside the text of its chunk.
md_splitter = MarkdownHeaderTextSplitter(
    strip_headers=False,
    headers_to_split_on=headers_to_split_on,
)
md_chunks = md_splitter.split_text(md_text)

# Show every chunk, numbered from 1, with a rule after each one
for position, piece in enumerate(md_chunks, start=1):
    print(f"Chunk {position}:\n")
    print(piece.page_content)
    print("\n---\n")

Chunk 1:

# VCT Hackathon Text Data

---

Chunk 2:

# Game Description  
VALORANT is a tactical shooter game released by Riot Games. It consists of **2 teams of 5 players each** competing against one another. It was first released on 2 June 2020.  
VALORANT bears many similarities to Counter Strike: Global Offensive (CS:GO or CS), one of the most significant first-person shooters in esports history made by Valve.  Many players from CS transitioned to playing VALORANT.  
These similarities are not only found in the gameplay mechanics but also extend to the gaming slang, terms, and callouts used in both games. The shared terminology makes communication between players smoother, as many of the same strategies and role names are used in both communities, helping players transition easily from one game to the other. We go more into specific terminology in the later sections.  
Ten players are divided into two teams, **Attackers** and **Defenders**, with the goal of winning the most rounds. 

In [5]:
"""
Export chunked file as a JSON 
"""

# Keep only the text of each chunk
chunks_list = list(map(lambda piece: piece.page_content, md_chunks))

# Serialise the list as a JSON array with 4-space indentation
chunks_json = json.dumps(chunks_list, indent=4)

# Write the serialised text to md_chunks.json in the current directory
with open('md_chunks.json', mode='w') as out_handle:
    out_handle.write(chunks_json)

print("Exported md_chunks to md_chunks.json")

Exported md_chunks to md_chunks.json


In [7]:
"""
Validate the chunked md file 
"""

# Read the chunked JSON file back in as UTF-8 text and parse it
chunked_json_path = 'C:\\Users\\Administrator\\Desktop\\Projects\\VCT-Hackathon\\DataV2\\chunked-valorant-text-data.json'
with open(chunked_json_path, 'r', encoding='utf-8') as handle:
    data = json.loads(handle.read())

# Pretty-print the parsed content for a visual check
print(json.dumps(data, indent=4))

[
    "# VCT Hackathon Text Data",
    "# Game Description  \nVALORANT is a tactical shooter game released by Riot Games. It consists of **2 teams of 5 players each** competing against one another. It was first released on 2 June 2020.  \nVALORANT bears many similarities to Counter Strike: Global Offensive (CS:GO or CS), one of the most significant first-person shooters in esports history made by Valve.  Many players from CS transitioned to playing VALORANT.  \nThese similarities are not only found in the gameplay mechanics but also extend to the gaming slang, terms, and callouts used in both games. The shared terminology makes communication between players smoother, as many of the same strategies and role names are used in both communities, helping players transition easily from one game to the other. We go more into specific terminology in the later sections.  \nTen players are divided into two teams, **Attackers** and **Defenders**, with the goal of winning the most rounds. They wou

In [8]:
"""
Resolve encoding issues 
"""

# Ordered (garbled, intended) pairs applied one after another by
# fix_encoding_issues. Order matters: "â€" is a prefix of the longer
# sequences, so "â€¦" must be handled before the bare "â€" rule; otherwise
# the prefix is rewritten first and a stray "¦" is left behind.
_MOJIBAKE_REPLACEMENTS = (
    ("â€™", "'"),
    ("â€”", "—"),
    ("Ã¢", "â"),
    ("Ã€", "À"),
    ("Ã©", "é"),
    ("â€œ", "“"),
    ("â€¦", "…"),
    ("â€", "”"),
    ("Â", ""),
)


def fix_encoding_issues(data):
    """Replace known mis-decoded character sequences in nested data.

    Args:
        data: A string, a dict, a list, or any other value. Dict values and
            list items are processed recursively; dict keys are left as-is.

    Returns:
        For a string, a new string with every sequence listed in
        ``_MOJIBAKE_REPLACEMENTS`` replaced, in table order. For a dict or
        list, a new container of the same kind holding the processed
        values. Any other value is returned unchanged.
    """
    if isinstance(data, str):
        for wrong, correct in _MOJIBAKE_REPLACEMENTS:
            data = data.replace(wrong, correct)
        return data

    if isinstance(data, dict):
        return {key: fix_encoding_issues(value) for key, value in data.items()}

    if isinstance(data, list):
        return [fix_encoding_issues(item) for item in data]

    # Numbers, None, booleans, etc. carry no text to repair
    return data

# Load the chunked JSON file as UTF-8 text and parse it
source_path = 'C:\\Users\\Administrator\\Desktop\\Projects\\VCT-Hackathon\\DataV2\\chunked-valorant-text-data.json'
with open(source_path, 'r', encoding='utf-8') as handle:
    data = json.loads(handle.read())

# Run the parsed structure through the replacement pass
corrected_data = fix_encoding_issues(data)

# Pretty-print the result for a visual check
print(json.dumps(corrected_data, indent=4))


[
    "# VCT Hackathon Text Data",
    "# Game Description  \nVALORANT is a tactical shooter game released by Riot Games. It consists of **2 teams of 5 players each** competing against one another. It was first released on 2 June 2020.  \nVALORANT bears many similarities to Counter Strike: Global Offensive (CS:GO or CS), one of the most significant first-person shooters in esports history made by Valve.  Many players from CS transitioned to playing VALORANT.  \nThese similarities are not only found in the gameplay mechanics but also extend to the gaming slang, terms, and callouts used in both games. The shared terminology makes communication between players smoother, as many of the same strategies and role names are used in both communities, helping players transition easily from one game to the other. We go more into specific terminology in the later sections.  \nTen players are divided into two teams, **Attackers** and **Defenders**, with the goal of winning the most rounds. They wou

In [10]:
"""
No further encoding issues, export as a JSON
"""

# Serialise with ensure_ascii=False so non-ASCII characters are written
# as themselves rather than as \uXXXX escapes, then save as UTF-8.
cleaned_json = json.dumps(corrected_data, ensure_ascii=False, indent=4)
with open('valorant-chunked-data.json', 'w', encoding='utf-8') as out_handle:
    out_handle.write(cleaned_json)