# Generate LLM tuning data for a diary 

In [3]:
import os
import pandas as pd
import re
import json
from datetime import datetime
import chardet
import textract
import pyarrow
import random
from dotenv import load_dotenv, find_dotenv


In [15]:
# Load settings from the nearest .env file (located via find_dotenv).
_ = load_dotenv(find_dotenv(), override=True) # read local .env file
# os.getenv returns None when a variable is not set, so the three values
# below may be None if the .env file is missing or incomplete.
TEXT_FILES_PATH = os.getenv('TEXT_FILES_PATH')  # Root directory that is walked for source files
BIRTH_DATE = os.getenv('BIRTH_DATE')  # Parsed later with the '%Y-%m-%d' format
AUTHOR_NAME = os.getenv('AUTHOR_NAME')  # Interpolated into every generated prompt

# Output locations: the extracted-text table and the JSONL tuning files
# (all entries, plus the training / validation / test splits).
PARQUET_OUTPUT='data/text_files_data.parquet'
JSON_OUTPUT='data/tuning_entries_all.jsonl'
JSON_OUTPUT_TRAINING='data/tuning_entries_training.jsonl'
JSON_OUTPUT_VALIDATION='data/tuning_entries_validation.jsonl'
JSON_OUTPUT_TEST='data/tuning_entries_test.jsonl'


In [9]:
def is_binary_file(file_path):
    """Heuristically decide whether a file holds binary (non-text) data.

    The file is scanned in 1024-byte blocks. It is reported as binary as soon
    as a block contains a NUL byte or any control byte below 32 other than
    tab (9), line feed (10) or carriage return (13).

    Args:
        file_path: Path of the file to inspect.

    Returns:
        True if a binary marker was found. False if the whole file was read
        without finding one, or if the file could not be read (the error is
        printed and otherwise ignored).
    """
    text_controls = (9, 10, 13)  # tab, LF, CR are normal in text files
    try:
        with open(file_path, 'rb') as handle:
            # iter() with a sentinel stops once read() returns empty bytes (EOF).
            for block in iter(lambda: handle.read(1024), b''):
                if b'\x00' in block:
                    return True
                for value in block:
                    if value < 32 and value not in text_controls:
                        return True
    except Exception as e:
        print(f"An error occurred while checking the file: {e}")

    return False

In [10]:
def _resolve_file_timestamp(file_name, mtime):
    """Return the datetime used to date a file.

    Defaults to the modification time. When the name is 10 characters long and
    starts with six digits it is read as DDMMYY (years taken as 19YY) and that
    date is used instead. An invalid DDMMYY date is reported and the
    modification time is kept, so date, year and age always agree.
    """
    timestamp = datetime.fromtimestamp(mtime)
    if len(file_name) == 10 and file_name[:6].isdigit():
        try:
            timestamp = datetime(
                1900 + int(file_name[4:6]),  # year
                int(file_name[2:4]),         # month
                int(file_name[:2]),          # day
            )
        except ValueError:
            print(f"File: {file_name}, Invalid Date Format")
    return timestamp


def _read_plain_text(file_path, file_extension):
    """Read a plain-text file using the encoding detected by chardet."""
    with open(file_path, 'rb') as f:
        detected = chardet.detect(f.read())['encoding']
    if detected == 'Windows-1252' and not file_extension:
        # Extension-less files detected as Windows-1252 are read as MacRoman.
        encoding = 'MacRoman'
    else:
        # chardet may report None (e.g. for an empty file); fall back to UTF-8
        # rather than silently using the platform default encoding.
        encoding = detected or 'utf-8'
    # Text mode is kept on purpose: it normalises \r and \r\n line endings.
    with open(file_path, 'r', encoding=encoding, errors='ignore') as f:
        return f.read()


def _read_word_document(file_path, file_extension):
    """Extract text from a .doc/.docx file; return None if extraction fails."""
    encoding = 'iso-8859-1' if file_extension == ".doc" else 'utf-8'
    try:
        return textract.process(file_path, input_encoding=encoding).decode('utf-8')
    except Exception as e:
        print(f"An error occurred: {file_path} {e}")
        return None


# Rows of [source, date, year, age, text, word_count], one per readable file.
data = []

# Date that ages are measured from.
if not BIRTH_DATE:
    raise ValueError("BIRTH_DATE is not set; expected a YYYY-MM-DD value in the environment/.env file")
comparison_date = datetime.strptime(BIRTH_DATE, '%Y-%m-%d')

skipped_files = 0

# Walk through the directory containing text files
for root, dirs, files in os.walk(TEXT_FILES_PATH):
    for file in files:
        if file == '.DS_Store':
            continue

        file_path = os.path.join(root, file)
        file_timestamp = _resolve_file_timestamp(file, os.stat(file_path).st_mtime)
        file_date = file_timestamp.strftime('%Y-%m-%d')
        # Always a string, so the 'year' column has a single type.
        file_year = file_timestamp.strftime('%Y')

        # Whole years between the comparison date and the file date; subtract
        # one when the anniversary has not yet been reached in that year.
        year_difference = (
            file_timestamp.year
            - comparison_date.year
            - ((file_timestamp.month, file_timestamp.day) < (comparison_date.month, comparison_date.day))
        )

        # Split the file name to get the extension (if any)
        _, file_extension = os.path.splitext(file)
        file_extension = file_extension.lower()

        # Reset per file so text from a previous file can never be reused.
        content = None
        if file_extension in (".doc", ".docx"):
            content = _read_word_document(file_path, file_extension)
        elif (file_extension == ".txt" or not file_extension) and not is_binary_file(file_path):
            content = _read_plain_text(file_path, file_extension)

        if content is None:
            # Unsupported type, binary file, or failed extraction: skip it
            # instead of recording another file's text under this path.
            skipped_files += 1
            continue

        word_count = len(content.split())
        data.append([file_path, file_date, file_year, year_difference, content, word_count])

print(f"Collected {len(data)} files, skipped {skipped_files} unsupported or unreadable files.")


In [11]:
# Turn the collected rows into a DataFrame, one column per field of each row.
df = pd.DataFrame(
    data,
    columns=['source', 'date', 'year', 'age', 'text', 'word_count'],
)
print(df.shape)
# Show a random sample of 20 rows as a quick sanity check of the contents.
df.sample(20)


(2678, 6)


Unnamed: 0,source,date,year,age,text,word_count
1089,/Users/dpradilla/DPData/notas/veintidos/Post-C...,2010-04-08,2010,34,"<a href=""http://www.amazon.com/gp/product/0750...",689
383,/Users/dpradilla/DPData/notas/Literatura en li...,2008-10-20,2008,33,El oficio editorial en la web social\n\nCualqu...,977
783,/Users/dpradilla/DPData/notas/once/la dualidad...,2001-06-30,2001,26,Que interesante la dualidad de no saber si una...,41
1148,/Users/dpradilla/DPData/notas/veintidos/epigen...,2010-09-05,2010,35,La epigenética esta compuesta por Todos los el...,13
26,/Users/dpradilla/DPData/notas/Los bloqueadores...,2015-10-04,2015,40,"<img class=""aligncenter size-full wp-image-547...",200
113,/Users/dpradilla/DPData/notas/duermes y tus oj...,2001-10-07,2001,26,Yo dije\n¿y la vida?\nella contestó\nla vida s...,14
358,/Users/dpradilla/DPData/notas/seis/celeste,2000-04-17,2000,24,"Su prima se llamaba Celeste, cuando llegó al c...",625
974,/Users/dpradilla/DPData/notas/quince/consueloI...,2003-07-24,2003,28,\nSonríes llena vestida de negro\ncon todo el ...,45
670,/Users/dpradilla/DPData/notas/trece/La poesía,2002-02-01,2002,26,Estoy en una época en la que estoy negado a es...,105
1591,/Users/dpradilla/DPData/notas/nueve/Shelomo es...,2000-09-30,2000,25,Shelomo estaba extasiado mientras Juan hablaba\n,6


In [12]:
# Save the DataFrame as a Parquet file.
# Create the parent directory first so a fresh checkout without it does not
# fail with a missing-directory error.
parquet_dir = os.path.dirname(PARQUET_OUTPUT)
if parquet_dir:  # empty when the output path has no directory component
    os.makedirs(parquet_dir, exist_ok=True)
df.to_parquet(PARQUET_OUTPUT)

In [6]:
# Reload the DataFrame from PARQUET_OUTPUT (the file written by df.to_parquet
# above) — presumably so the following cells can start from the saved file.
df = pd.read_parquet(PARQUET_OUTPUT)

In [19]:
# Create a tuning file
# Holds one {'prompt': ..., 'completion': ...} dict per DataFrame row;
# filled by the loop further down in this cell.
tuning_entries = []

def sanitize_text(text):
    """Strip HTML-like tags and double quotes from a piece of text.

    Anything between '<' and the next '>' on the same line is removed, then
    every double-quote character is dropped (not escaped).

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    without_tags = re.sub('<.*?>', '', text)
    return without_tags.replace('"', '')

def get_word_count_classification(word_count):
    """Map a word count to a length label used in the prompt.

    Args:
        word_count: Number of words in the entry.

    Returns:
        "corta" for counts up to 100, "media" for counts up to 500,
        and "larga" for anything else.
    """
    if word_count <= 100:
        return "corta"
    if word_count <= 500:
        return "media"
    return "larga"


# Build one prompt/completion record per row of the DataFrame.
# The record uses the generic {'prompt', 'completion'} instruction format;
# other layouts (e.g. {'input', 'output'} or a single Mistral-style "text"
# field with [INST] markers) would need a different dict here.
for age, raw_text, word_count in zip(df['age'], df['text'], df['word_count']):
    wc_class = get_word_count_classification(word_count)
    prompt = f'Escribe una entrada {wc_class} de diario por {AUTHOR_NAME} cuando tenía {age} años'
    # The completion is the entry text with tags and double quotes removed.
    tuning_entries.append({'prompt': prompt, 'completion': sanitize_text(raw_text)})

# Write all entries as JSON Lines: one JSON object per line, non-ASCII kept as-is.
with open(JSON_OUTPUT, 'w', encoding='utf-8') as file:
    file.writelines(
        json.dumps(entry, ensure_ascii=False) + '\n' for entry in tuning_entries
    )

print("Tuning file created successfully.")

Tuning file created successfully.


In [20]:
import random
import json

# Fractions of the entries reserved for validation and testing; whatever is
# left over becomes training data.
validation_ratio = 0.2
test_ratio = 0.0  # No test split by default; e.g. 0.1 would reserve 10% for testing

# Fixed seed so that re-running the notebook reproduces the same split.
SPLIT_SEED = 42

if validation_ratio < 0 or test_ratio < 0 or validation_ratio + test_ratio > 1:
    raise ValueError("validation_ratio and test_ratio must be non-negative and sum to at most 1")

# Calculate the number of validation and test samples based on the ratios
num_samples = len(tuning_entries)
num_test_samples = int(num_samples * test_ratio)
num_validation_samples = int(num_samples * validation_ratio)

# Shuffle a copy with a dedicated seeded generator: tuning_entries keeps its
# original order and the global random state is left untouched.
shuffled_entries = list(tuning_entries)
random.Random(SPLIT_SEED).shuffle(shuffled_entries)

# Split the data: test first, then validation, the remainder is training.
test_data = shuffled_entries[:num_test_samples]
remaining_data = shuffled_entries[num_test_samples:]
validation_data = remaining_data[:num_validation_samples]
training_data = remaining_data[num_validation_samples:]

# Every entry must land in exactly one split.
assert len(training_data) + len(validation_data) + len(test_data) == num_samples

# Helper used to write each split to its own file.
def save_data(data, filename):
    """Write items to a UTF-8 JSON Lines file, one JSON object per line.

    Args:
        data: Iterable of JSON-serialisable items.
        filename: Path of the file to create or overwrite.
    """
    with open(filename, 'w', encoding='utf-8') as out:
        # ensure_ascii=False keeps accented characters readable in the file.
        out.writelines(json.dumps(item, ensure_ascii=False) + '\n' for item in data)

# Write each split to its own JSONL file.
for subset, destination in (
    (training_data, JSON_OUTPUT_TRAINING),
    (validation_data, JSON_OUTPUT_VALIDATION),
    (test_data, JSON_OUTPUT_TEST),
):
    save_data(subset, destination)

# Report how many entries ended up in each split.
print(f"Split {len(training_data)} samples for training, {len(validation_data)} samples for validation, and {len(test_data)} samples for testing.")


Split 2143 samples for training, 535 samples for validation, and 0 samples for testing.
