In [6]:
import os
from pathlib import Path 
import pandas as pd
import json 
from tqdm import tqdm
from langchain_groq import ChatGroq
from langchain.schema import HumanMessage, SystemMessage 
from dotenv import load_dotenv 

# Load variables from a local .env file into the process environment before any
# client is constructed (the recorded output `True` is this call's return value).
# NOTE(review): presumably this is what supplies the Groq API key to ChatGroq —
# confirm which variable name the client expects.
load_dotenv() 

True

In [14]:
# Locations of the cleaned inputs. Despite the *_DIR suffix, the two CLEAN_*
# constants point at JSON files, not directories.
PROCESSED_DIR = Path('../data/processed')
CLEAN_RAW_DIR = PROCESSED_DIR / 'cleaned_raw.json'
CLEAN_SECTIONS_DIR = PROCESSED_DIR / 'cleaned_sections.json'

# Output directory for the styled results; created up front (with parents) so
# the save step cannot fail on a missing folder.
STYLED_DIR = Path('../data') / 'styled'
STYLED_DIR.mkdir(parents=True, exist_ok=True)

In [15]:
# Read both cleaned inputs fully into memory as parsed JSON.
# Later cells iterate `cleaned_raw` as {file name: text} and
# `cleaned_sections` as {file name: {section: text}}.
cleaned_raw = json.loads(CLEAN_RAW_DIR.read_text(encoding='utf-8'))
cleaned_sections = json.loads(CLEAN_SECTIONS_DIR.read_text(encoding='utf-8'))

In [31]:
# Initialize the LangChain Groq chat client shared by every styling call.
# The settings are collected in one dict so they are easy to spot and tweak.
llm_settings = {
    'model': 'qwen/qwen3-32b',
    'temperature': 0.7,
}
llm = ChatGroq(**llm_settings)

In [32]:
def style_text(
    text,
    style = 'Write this text in my own grammar and tone, keep meaning same'
):
    """Rewrite ``text`` with the module-level ``llm``, using ``style`` as the system prompt.

    Parameters
    ----------
    text : str or None
        Text to restyle. ``None``, empty and whitespace-only input short-circuit
        to ``''`` without calling the model.
    style : str
        Instruction sent to the model as the system message.

    Returns
    -------
    str
        The model's reply with surrounding whitespace stripped. If the call
        (or reading its content) raises, the error is printed and the original
        ``text`` is returned unchanged, so one failure does not abort a batch.
    """
    # Guard None explicitly (e.g. a JSON null value): calling .strip() on it
    # would raise AttributeError outside the try block and stop the whole loop.
    if text is None or not text.strip():
        return ''
    try:
        messages = [
            SystemMessage(content=style),
            HumanMessage(content=text),
        ]
        response = llm.invoke(messages)
        return response.content.strip()
    except Exception as e:
        # Deliberate best-effort: keep the unstyled text rather than lose the
        # document. Callers cannot tell this apart from a styled result, so
        # check the printed errors after a batch run.
        print(f'Error styling: {e}')
        return text

In [28]:
# Style every raw document, one model call per file.
# Results are written into the dict as they arrive, so an interrupted run
# still leaves the documents finished so far in `styled_raw_dic`.
styled_raw_dic = {}

raw_progress = tqdm(cleaned_raw.items(), desc='Styling raw docs')
for doc_name, doc_text in raw_progress:
    styled_raw_dic[doc_name] = style_text(doc_text)

Styling raw docs: 100%|██████████| 31/31 [05:30<00:00, 10.66s/it]


In [29]:
# Display the styled raw documents for a visual check (this prints the whole dict).
# NOTE(review): the recorded output shows the model prepending a preamble
# ("Here's a rewritten version ...") and adding markdown headings; that text
# ends up in the saved result unless the prompt or post-processing removes it.
styled_raw_dic 

{'122516956_CaseStudy_EEI5270.pdf': "Here's a rewritten version of the provided text in a more readable and grammatically correct tone:\n\n**Case Study: EEI Information Security - LBD Mawijesundara Spa**\n\n**Table of Contents**\n\n1. **Background and Threat Assessment**\n2. **Quantum Computing and Cryptography**\n\t* **Quantum Computing**: Understanding the basics of quantum computing and its potential impact on cryptography.\n\t* **Quantum Cryptography**: Exploring the relevance of quantum cryptography for developing nations.\n3. **Security Analysis of Thaproban**\n\t* **Key Vulnerabilities**: Identifying vulnerabilities in Thaproban's current systems.\n\t* **Systematic Issues**: Analyzing systematic issues that amplify the risk of a quantum attack.\n\t* **Consequences of a Quantum Attack**: Understanding the potential consequences of a quantum attack on Thaproban.\n4. **PQ Landscape**\n\t* **Recent Advances in PQC**: Discussing recent advances in post-quantum cryptography (PQC).\n\t

In [33]:
# Style the sectioned documents: one model call per section of every file.
# The per-file dict is registered before its sections are filled in, so a
# partially processed file keeps whatever sections were already styled.
styled_section_dict = {}

section_progress = tqdm(cleaned_sections.items(), desc='Styling sectioned docs')
for doc_name, doc_sections in section_progress:
    styled_parts = {}
    styled_section_dict[doc_name] = styled_parts
    for heading, body in doc_sections.items():
        styled_parts[heading] = style_text(body)

Styling sectioned docs:   0%|          | 0/31 [00:00<?, ?it/s]

Styling sectioned docs: 100%|██████████| 31/31 [21:55<00:00, 42.42s/it]


In [35]:
# Persist both results as pretty-printed UTF-8 JSON inside STYLED_DIR.
# ensure_ascii=False keeps non-ASCII characters readable instead of \u escapes.
raw_out_path = STYLED_DIR / 'styled_raw.json'
with raw_out_path.open('w', encoding='utf-8') as raw_out:
    json.dump(styled_raw_dic, raw_out, ensure_ascii=False, indent=2)

sections_out_path = STYLED_DIR / 'styled_sections.json'
with sections_out_path.open('w', encoding='utf-8') as sections_out:
    json.dump(styled_section_dict, sections_out, ensure_ascii=False, indent=2)
    