# Dependencies

In [1]:
import os
import json
import tiktoken
import openai
from dotenv import load_dotenv
from PyPDF2 import PdfReader
import docx
import tiktoken
import random
from dotenv import load_dotenv

In [2]:
pip install openai tiktoken python-dotenv


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


# Script Overview

This Python script processes all `.pdf` and `.docx` files in a specified folder (`folder_path`), including its subfolders, by extracting text from each file and truncating it to the first 350 tokens. Each excerpt is then saved as an individual `.txt` file in an output folder (`output_folder`).

## How It Works

1. **Token Limiting**: The script uses `tiktoken` to limit extracted text to 350 tokens.
2. **Text Extraction**:
   - PDF files are processed with `PyPDF2`, extracting text from each page until the token limit is reached.
   - DOCX files are processed with `python-docx`, extracting text from all paragraphs.
3. **Saving Results**: Each file’s processed content is saved as a `.txt` file in the output folder.

## Key Variables

- `folder_path`: Path to the directory containing the input files.
- `output_folder`: Path where the processed `.txt` files are saved.

Run the script to generate processed `.txt` files for each PDF or DOCX in the specified folder.


In [8]:
import os
from PyPDF2 import PdfReader
import docx
import tiktoken
from dotenv import load_dotenv

# Load environment variables from a .env file, if one is found (python-dotenv).
load_dotenv()
# Module-level tokenizer shared by the extraction helpers to count and truncate tokens.
tokenizer = tiktoken.get_encoding("cl100k_base")

def limit_to_350_tokens(text):
    """Return ``text`` cut down to at most its first 350 tokens.

    The text is encoded with the module-level ``tokenizer``, the token list
    is sliced to 350 entries (a no-op when it is already shorter), and the
    result is decoded back into a string.
    """
    token_ids = tokenizer.encode(text)
    return tokenizer.decode(token_ids[:350])

def extract_pdf_text(file_path):
    """Return the first 350 tokens of text extracted from a PDF.

    Pages are read in order and reading stops as soon as the accumulated
    text reaches 350 tokens, so long documents are not parsed in full.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The extracted text truncated to at most 350 tokens; an empty string
        when no page yields any text.
    """
    reader = PdfReader(file_path)
    text = ""
    for page in reader.pages:
        page_text = page.extract_text()
        if not page_text:
            # Nothing was added, so the token count cannot have changed.
            continue
        # Separate pages with a newline so the last word of one page does not
        # fuse with the first word of the next.
        text = f"{text}\n{page_text}" if text else page_text
        if len(tokenizer.encode(text)) >= 350:
            break
    return limit_to_350_tokens(text)

def extract_docx_text(file_path):
    """Return the first 350 tokens of a DOCX file's paragraph text.

    The text of every paragraph in the document is joined with single
    spaces and the result is truncated by ``limit_to_350_tokens``.
    """
    document = docx.Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return limit_to_350_tokens(" ".join(paragraph_texts))

def process_files_in_folder(folder_path, output_folder):
    """Write a 350-token text excerpt for every PDF/DOCX under ``folder_path``.

    The folder is walked recursively. Each supported file produces
    ``<output_folder>/<stem>.txt``; files with any other extension are
    ignored. Extensions are matched case-insensitively, so ``REPORT.PDF``
    and ``thesis.Docx`` are processed too.

    Note: the output name is derived from the file stem only, so two inputs
    that share a stem (in different subfolders, or a .pdf next to a .docx)
    write to the same .txt file and the one processed last wins.

    Args:
        folder_path: Root directory searched for input documents.
        output_folder: Directory that receives the .txt excerpts; created
            (including parents) when it does not exist.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_folder, exist_ok=True)

    for root, _dirs, files in os.walk(folder_path):
        for filename in files:
            file_path = os.path.join(root, filename)
            # Compare on a lower-cased name so upper/mixed-case extensions match.
            lowered = filename.lower()
            if lowered.endswith('.pdf'):
                text_excerpt = extract_pdf_text(file_path)
            elif lowered.endswith('.docx'):
                text_excerpt = extract_docx_text(file_path)
            else:
                continue

            output_file_path = os.path.join(output_folder, os.path.splitext(filename)[0] + ".txt")
            with open(output_file_path, 'w', encoding='utf-8') as f:
                f.write(text_excerpt)

# Input directory that is walked for .pdf/.docx files, and the directory that
# receives one .txt excerpt per processed file.
folder_path = "./hngr-isps"
output_folder = "./prompts"
process_files_in_folder(folder_path, output_folder)
print(f"Processed files saved to {output_folder}")

Processed files saved to ./prompts


In [7]:
import os
import openai
from dotenv import load_dotenv
import random

# Take the API key from the environment (optionally populated from a .env file)
# rather than hardcoding it; os.getenv returns None when the variable is unset.
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

def ask_ai(prompt, text_excerpt, model="gpt-3.5-turbo-0125"):
    """Send one chat-completion request and return the reply text.

    ``prompt`` is sent as the system message and ``text_excerpt`` as the
    user message. The reply is capped at 500 tokens, and the content of the
    first returned choice is handed back to the caller.
    """
    conversation = [
        {"role": "system", "content": prompt},
        {"role": "user", "content": text_excerpt},
    ]
    completion = openai.chat.completions.create(
        model=model,
        messages=conversation,
        max_tokens=500,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

def log_skipped_file(filename, log_path, reason):
    """Append a "skipped" entry for ``filename`` to the log and echo it.

    Args:
        filename: Name of the file that was not processed.
        log_path: Path of the log file; the entry is appended to it.
        reason: Short human-readable explanation of why it was skipped.
    """
    # Explicit UTF-8 (like every other file written in this notebook) so that
    # filenames with non-ASCII characters cannot fail on platforms whose
    # default encoding is not UTF-8.
    with open(log_path, 'a', encoding='utf-8') as log_file:
        log_file.write(f"{filename} - Skipped due to {reason}\n")
    print(f"Skipping {filename}: {reason}")

def save_ai_response(filename, response, output_folder):
    """Store ``response`` as ``<stem>_response.txt`` inside ``output_folder``.

    The stem is ``filename`` without its extension. The file is written as
    UTF-8 (overwriting any previous content) and a confirmation line is
    printed afterwards.
    """
    stem, _extension = os.path.splitext(filename)
    target_path = os.path.join(output_folder, stem + "_response.txt")
    with open(target_path, mode='w', encoding='utf-8') as out:
        out.write(response)
    print(f"AI response saved for {filename}")

def main():
    """Classify text excerpts with the chat model and save each response.

    Steps:
      1. Ensure the response folder exists and start a fresh skipped-files log.
      2. Collect the ``.txt`` excerpts from ``./prompts``.
      3. Ask on stdin how many to classify: ``all`` or a non-negative whole
         number (a random sample of that size, capped at the number available).
      4. For each selected excerpt, skip and log it when empty; otherwise send
         it to ``ask_ai`` and store the reply via ``save_ai_response``.

    Raises:
        FileNotFoundError: No ``.txt`` files exist in the excerpt folder.
        ValueError: The answer is neither ``all`` nor a non-negative integer.
    """
    output_folder = "./prompts"
    ai_responses_folder = "./ai_responses"
    skipped_log_path = "./skipped_files.txt"

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(ai_responses_folder, exist_ok=True)

    # Mode 'w' deliberately resets the log at the start of every run.
    with open(skipped_log_path, 'w', encoding='utf-8') as log_file:
        log_file.write("Skipped Files Log\n==================\n")

    files = [f for f in os.listdir(output_folder) if f.endswith('.txt')]
    if not files:
        raise FileNotFoundError("No .txt files found in the output folder.")

    # Strip surrounding whitespace so answers such as "all " or " 5" are accepted.
    num_documents = input("Enter the number of documents to classify (or 'all' to classify all documents): ").strip()
    if num_documents.lower() == 'all':
        selected_files = files
    else:
        try:
            requested = int(num_documents)
        except ValueError:
            raise ValueError(
                f"Expected a whole number or 'all', got {num_documents!r}."
            ) from None
        if requested < 0:
            raise ValueError("The number of documents to classify must not be negative.")
        selected_files = random.sample(files, min(requested, len(files)))

    prompt = (
        "From the following text, extract the following details:\n"
        "- Year\n"
        "- City\n"
        "- Country\n"
        "- Major/Field of Study\n"
        "- Themes\n"
        "- Author Name\n"
        "- Advisor\n\n"
        "Output format:\n"
        "Year: [Year]\n"
        "City: [City]\n"
        "Country: [Country]\n"
        "Major/Field: [Major/Field]\n"
        "Themes: [Themes]\n"
        "Author: [Author]\n"
        "Advisor: [Advisor]"
    )

    for filename in selected_files:
        file_path = os.path.join(output_folder, filename)
        with open(file_path, 'r', encoding='utf-8') as f:
            text_excerpt = f.read().strip()

        # An empty excerpt would only waste an API call; record it and move on.
        if not text_excerpt:
            log_skipped_file(filename, skipped_log_path, reason="empty content")
            continue

        response = ask_ai(prompt, text_excerpt)
        save_ai_response(filename, response, ai_responses_folder)

    print(f"\nLog of skipped files saved to {skipped_log_path}")

# Run the workflow; main() prompts on stdin for how many documents to classify.
main()

Enter the number of documents to classify (or 'all' to classify all documents):  all


AI response saved for Brown, Carolyn--Final ISP.txt
AI response saved for Wilson ISP.txt
AI response saved for Halim, Valerie-Final ISP.txt
AI response saved for Willig ISP.txt
AI response saved for Sharp, Trudy.txt


KeyboardInterrupt: 

In [9]:
import os
import sqlite3
import json

def parse_ai_responses_to_dict(ai_responses_folder):
    """Parse every file in ``ai_responses_folder`` into ``{filename: fields}``.

    Each file is read as UTF-8 and split into lines. A line containing a
    colon contributes one entry: the text before the first colon is the key
    and the remainder is the value, both stripped of surrounding whitespace.
    Lines without a colon are ignored, and a repeated key keeps the value
    seen last. Directory entries that are not regular files are skipped.
    """
    results = {}

    for entry in os.listdir(ai_responses_folder):
        entry_path = os.path.join(ai_responses_folder, entry)
        if not os.path.isfile(entry_path):
            continue

        with open(entry_path, 'r', encoding='utf-8') as handle:
            body = handle.read().strip()

        fields = {}
        for row in body.split('\n'):
            # partition() yields an empty separator when the row has no colon.
            label, separator, remainder = row.partition(':')
            if separator:
                fields[label.strip()] = remainder.strip()

        results[entry] = fields

    return results

def insert_data_into_db(parsed_data, db_path='ai_responses.db'):
    """Upsert parsed AI responses into the ``ai_responses`` SQLite table.

    The table is created on first use with ``filename`` as primary key.
    Every entry of ``parsed_data`` becomes one row; an existing row with the
    same filename is replaced. Fields missing from an entry are stored as
    empty strings. All writes happen in a single transaction that is rolled
    back if any statement fails, and the connection is always closed.

    Args:
        parsed_data: Mapping of filename to a dict that may contain the keys
            'Year', 'City', 'Country', 'Major/Field', 'Themes', 'Author'
            and 'Advisor'.
        db_path: Path of the SQLite database file.
    """
    field_keys = ('Year', 'City', 'Country', 'Major/Field', 'Themes', 'Author', 'Advisor')
    rows = [
        (filename, *(details.get(key, '') for key in field_keys))
        for filename, details in parsed_data.items()
    ]

    conn = sqlite3.connect(db_path)
    try:
        # `with conn` commits on success and rolls back if a statement raises.
        with conn:
            # Create table if it doesn't exist
            conn.execute('''
                CREATE TABLE IF NOT EXISTS ai_responses (
                    filename TEXT PRIMARY KEY,
                    year TEXT,
                    city TEXT,
                    country TEXT,
                    major_field TEXT,
                    themes TEXT,
                    author TEXT,
                    advisor TEXT
                )
            ''')
            # Insert (or replace) all rows in one batch
            conn.executemany('''
                INSERT OR REPLACE INTO ai_responses (filename, year, city, country, major_field, themes, author, advisor)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            ''', rows)
    finally:
        # Close even on failure so the database file handle is not leaked.
        conn.close()

    print(f"\nData has been inserted into the SQLite database at '{db_path}'.")

# Folder that holds the saved AI response text files
ai_responses_folder = "./ai_responses"

# Parse every response file into a dict, then insert-or-replace the rows in the default SQLite database
parsed_data = parse_ai_responses_to_dict(ai_responses_folder)
insert_data_into_db(parsed_data)


Data has been inserted into the SQLite database at 'ai_responses.db'.
