Install the dependency

In [None]:
!pip3 install nltk

Utility functions for cleanup

In [1]:
import os
import re
import pandas as pd
import numpy as np


def extract_text_from_folder(folder_path,column_name="text"):
    """Read every ``.txt`` / ``.rst`` file below ``folder_path`` into a DataFrame.

    The folder is walked recursively. Directories and file names are visited
    in sorted order so that repeated runs produce rows in the same order.

    Args:
        folder_path: Root directory to search.
        column_name: Name of the single column holding the file contents.

    Returns:
        A one-column ``pandas.DataFrame`` with one row per matching file. The
        frame is empty (but still has the column) when nothing matches.

    Raises:
        UnicodeDecodeError: If a matching file is not valid UTF-8.
    """
    text_data = []
    for root, dirs, files in os.walk(folder_path):
        # Sorting ``dirs`` in place controls the order os.walk descends in.
        dirs.sort()
        for file_name in sorted(files):
            if file_name.endswith(('.txt', '.rst')):
                file_path = os.path.join(root, file_name)
                with open(file_path, 'r', encoding='utf-8') as file:
                    text_data.append(file.read())
    # Build the frame straight from the list: going through np.array() would
    # create a fixed-width Unicode array padded to the longest file.
    return pd.DataFrame({column_name: text_data})
def extract_code_from_folder(folder_path,column_name="text"):
    """Read every ``.h`` file below ``folder_path`` into a DataFrame.

    The folder is walked recursively. Directories and file names are visited
    in sorted order so that repeated runs produce rows in the same order.

    Args:
        folder_path: Root directory to search.
        column_name: Name of the single column holding the file contents.

    Returns:
        A one-column ``pandas.DataFrame`` with one row per matching file. The
        frame is empty (but still has the column) when nothing matches.

    Raises:
        UnicodeDecodeError: If a matching file is not valid UTF-8.
    """
    text_data = []
    for root, dirs, files in os.walk(folder_path):
        # Sorting ``dirs`` in place controls the order os.walk descends in.
        dirs.sort()
        for file_name in sorted(files):
            if file_name.endswith('.h'):
                file_path = os.path.join(root, file_name)
                with open(file_path, 'r', encoding='utf-8') as file:
                    text_data.append(file.read())
    # Build the frame straight from the list: going through np.array() would
    # create a fixed-width Unicode array padded to the longest file.
    return pd.DataFrame({column_name: text_data})

def clean_text_data(text_data):
    """Strip documentation markup from one string and return the result.

    Each (pattern, flags) pair below is removed from the text in the listed
    order; every match is replaced by the empty string. Indentation and blank
    lines are not normalised.

    Args:
        text_data: The raw text to clean.

    Returns:
        The text with all matches of the patterns removed.
    """
    removal_rules = (
        # `:link_to_translation:` role followed by a backtick-quoted argument
        (r'\:link_to_translation\:\`[^`]*\`', 0),
        # Triple-backtick fenced code blocks, possibly spanning several lines
        (r'```.*?```', re.DOTALL),
        # HTML tags
        (r'<[^>]+>', 0),
        # Titles with a line of '=' (or '-') both above and below
        (r'^=+\n.*\n=+\n', re.MULTILINE),
        (r'^-+\n.*\n-+\n', re.MULTILINE),
        # '*' bullet items; the leading \s* can also swallow preceding blank lines
        (r'^\s*\*\s+.*\n', re.MULTILINE),
        # 'N.' numbered items
        (r'^\s*\d+\.\s+.*\n', re.MULTILINE),
        # Leftover runs of decoration characters
        (r'====+', 0),
        (r'\^\^\^\^+', 0),
        (r'----+', 0),
        (r'\*\*\*\*\*+', 0),
    )

    cleaned = text_data
    for pattern, flags in removal_rules:
        cleaned = re.sub(pattern, '', cleaned, flags=flags)
    return cleaned

Split the textual data into chunks

In [None]:

import pandas as pd
import nltk.data

# Load the NLTK sentence tokenizer
# NOTE(review): split_text() below does not reference this object, and a later
# cell rebinds the name `tokenizer` to a Hugging Face tokenizer, so this load
# looks unused. Presumably it also needs the punkt resource to be downloaded
# beforehand (nltk.download('punkt')) — confirm or remove.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')


def split_text(text, chunk_size=500):
    """Split ``text`` into consecutive chunks that end on sentence punctuation.

    A chunk is closed at the first ``.``, ``!`` or ``?`` encountered once the
    chunk holds at least ``chunk_size`` characters, so chunks are usually a
    little longer than ``chunk_size``. Whatever remains after the last split is
    returned as a final (possibly shorter) chunk. Joining the chunks gives back
    the original text.

    Args:
        text: The string to split.
        chunk_size: Minimum number of characters a chunk must reach before it
            may be closed at a sentence ending.

    Returns:
        A list of chunk strings; empty when ``text`` is empty.
    """
    sentence_endings = ".!?"
    chunks = []
    chunk_start = 0
    for position, char in enumerate(text):
        # Plain membership test instead of a regex search per character, and
        # slicing instead of growing the chunk one character at a time.
        if char in sentence_endings and position + 1 - chunk_start >= chunk_size:
            chunks.append(text[chunk_start:position + 1])
            chunk_start = position + 1
    if chunk_start < len(text):  # Append any remaining part
        chunks.append(text[chunk_start:])
    return chunks

def split_dataframe_manual(df, column_name='text', chunk_size=500):
    """Expand each row of ``df`` into one row per chunk of its text column.

    The text in ``column_name`` is split with ``split_text``; every resulting
    chunk becomes a copy of the source row with ``column_name`` replaced by
    the chunk. Rows whose text yields no chunks (empty strings) are dropped.

    Args:
        df: Input frame containing ``column_name``.
        column_name: Column holding the text to split.
        chunk_size: Passed through to ``split_text``.

    Returns:
        A new DataFrame with the same columns as ``df``. Each output row keeps
        the index label of the row it came from, so labels repeat when a text
        produces several chunks.
    """
    new_rows = []
    for index, row in df.iterrows():
        for chunk in split_text(row[column_name], chunk_size):
            new_row = row.copy()
            new_row[column_name] = chunk
            new_rows.append(new_row)
    # Passing the columns explicitly keeps the schema even when no rows were
    # produced; without it an empty result would have no columns at all.
    return pd.DataFrame(new_rows, columns=df.columns)



Token distribution plotting

In [None]:
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
# Model whose tokenizer is used to measure the length of the dataset rows.
base_model_id = "mistralai/Mistral-7B-Instruct-v0.2"
# Rebinds the module-level name `tokenizer` (previously the NLTK sentence
# tokenizer) to the tokenizer loaded for base_model_id.
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    padding_side="left",
    add_eos_token=True,
    add_bos_token=True,
)
# Use the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

def generate_and_tokenize_prompt(prompt):
    """Tokenize the ``"text"`` field of ``prompt`` with the module-level ``tokenizer``.

    Args:
        prompt: A mapping with a ``"text"`` entry.

    Returns:
        Whatever ``tokenizer`` returns for that text.
    """
    text = prompt["text"]
    return tokenizer(text)

def plot_data_lengths(tokenized_train_dataset,column_name="text"):
    """Plot a histogram of the length of ``column_name`` across a dataset.

    The length is ``len(row[column_name])``. With the default ``"text"`` column
    that is the number of characters in each row's text; pass
    ``column_name="input_ids"`` to plot token counts instead. The axis label
    and title name the column that was actually measured.

    Args:
        tokenized_train_dataset: Iterable of rows indexable by ``column_name``.
        column_name: The field whose length is histogrammed.

    Returns:
        None. Prints the number of rows and shows the figure.
    """
    lengths = [len(x[column_name]) for x in tokenized_train_dataset]
    print(len(lengths))

    # Plotting the histogram
    plt.figure(figsize=(10, 6))
    plt.hist(lengths, bins=20, alpha=0.7, color='blue')
    plt.xlabel(f'Length of {column_name}')
    plt.ylabel('Frequency')
    plt.title(f'Distribution of Lengths of {column_name}')
    plt.show()



ESP-IDF docs cleanup

In [None]:
# Load every .txt/.rst file under the docs folder into a one-column DataFrame.
text_data = extract_text_from_folder("./raw_dataset_USFTHF/docs/",column_name="text")
# Show the loaded frame as the cell output.
text_data


In [None]:
# Drop exact duplicate documents, then apply clean_text_data to every cell.
# NOTE(review): presumably relies on the element-wise DataFrame.map of recent
# pandas versions (applymap in older ones) — confirm the required version.
clean_data = text_data.drop_duplicates()
clean_data = clean_data.map(clean_text_data)


In [None]:
from datasets import Dataset
# Wrap the cleaned docs frame in a Hugging Face Dataset.
train_dataset = Dataset.from_pandas(clean_data)
# NOTE(review): map() is called without a function; presumably this is an
# identity pass that leaves the rows unchanged — confirm and consider removing.
train_dataset= train_dataset.map()

In [None]:
# Tokenize the "text" field of every document (before chunking) and plot the
# length histogram.
# NOTE: plot_data_lengths is called with its default column_name="text", so the
# histogram shows len() of each text string, not the number of input_ids.
tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt)
plot_data_lengths(tokenized_train_dataset)

In [None]:
# Split every document into chunks that are closed at the first sentence
# ending after reaching 1000 characters (the last chunk of a document may be
# shorter).
split_data=split_dataframe_manual(clean_data,column_name="text",chunk_size=1000)

# NOTE(review): a bare expression that is not the last line of the cell;
# presumably it is not rendered — confirm or move to its own cell.
split_data
# Rebuild the Dataset from the chunked frame and display it.
train_dataset = Dataset.from_pandas(split_data)
train_dataset

In [None]:
# Display the chunked Dataset again.
train_dataset

In [None]:

# Tokenize the chunked dataset and plot the length histogram after splitting.
# NOTE: with the default column_name="text" the histogram shows len() of each
# text string, not the number of input_ids.
tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt)
plot_data_lengths(tokenized_train_dataset)


Remove overly long chunks (more than 4000 characters)


In [None]:
# Keep only the rows whose text, converted with str(), is at most 4000
# characters long.
split_data = split_data[split_data['text'].apply(lambda x: len(str(x)) <= 4000)]

# Rebuild the Dataset from the filtered frame and display it.
train_dataset = Dataset.from_pandas(split_data)
train_dataset

In [None]:

# Tokenize the filtered dataset and plot the length histogram once more.
# NOTE: with the default column_name="text" the histogram shows len() of each
# text string, not the number of input_ids.
tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt)
plot_data_lengths(tokenized_train_dataset)

In [None]:
# Upload the docs dataset as the "train" split of the Hub repository.
# NOTE(review): later cells push to the same repository and split again;
# presumably each push replaces the previous contents — confirm.
train_dataset.push_to_hub("gouthamsk/esp_idf_text",split="train")

Clean up ESP-IDF code examples

In [3]:
# Load every .h file under the code folder into a one-column DataFrame.
code_data = extract_code_from_folder("./raw_dataset_USFTHF/code/",column_name="text")
# NOTE(review): the frame written here is the raw load (no de-duplication or
# cleaning has run yet) although the file is named clean_data.csv.
code_data.to_csv('clean_data.csv', index=False)
# Show the loaded frame as the cell output.
code_data

Unnamed: 0,text
0,/*\n * SPDX-FileCopyrightText: 2022-2023 Espre...
1,/*\n * SPDX-FileCopyrightText: 2024 Espressif ...
2,/*\n * SPDX-FileCopyrightText: 2023 Espressif ...
3,/*\n * SPDX-FileCopyrightText: 2017-2021 Espre...
4,/*\n * SPDX-FileCopyrightText: 2015-2021 Espre...
...,...
4161,#ifndef __HTTPD_TESTS_H__\n#define __HTTPD_TES...
4162,/*\n * SPDX-FileCopyrightText: 2021-2023 Espre...
4163,/*\n * SPDX-FileCopyrightText: 2022-2023 Espre...
4164,/* Keep Alive engine for wss server example\n\...


In [None]:
# Drop exact duplicate files. The clean_text_data pass used for the docs is
# left disabled for the code dataset (commented out below).
clean_code = code_data.drop_duplicates()
# clean_code = code_data.map(clean_text_data)

# clean_code

In [None]:
# Split each code file into chunks closed at the first '.', '!' or '?' after
# reaching 1000 characters.
# NOTE(review): the following cell builds its Dataset from clean_code, not
# from this split_data — confirm which frame is intended.
split_data=split_dataframe_manual(clean_code,column_name="text",chunk_size=1000)


In [None]:
from datasets import Dataset
# Build and tokenize a Dataset of the de-duplicated code files.
# NOTE(review): this uses clean_code (whole files); the chunked split_data
# computed in the previous cell is not used here — confirm.
train_dataset = Dataset.from_pandas(clean_code)
tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt)

In [None]:
# Plot the length histogram for the code dataset (default column_name="text",
# i.e. len() of each text string).
plot_data_lengths(tokenized_train_dataset)

In [None]:

# NOTE(review): repeats the tokenization already performed two cells above on
# the same train_dataset — confirm whether this cell is still needed.
tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt)


In [None]:
from sklearn.model_selection import train_test_split
from datasets import Dataset
# Hold out 10% of the rows of new_df as test data.
# NOTE(review): `new_df` is not defined in any visible cell, so this cell
# presumably fails on a fresh kernel — confirm which frame is meant. No
# random_state is passed, so the split is presumably different on every run.

train_data, test_data = train_test_split(new_df, test_size=0.1)


In [None]:
# Build a Dataset from split_train_data and upload it as the "train" split.
# NOTE(review): `split_train_data` is not defined in any visible cell, so this
# cell presumably fails on a fresh kernel — confirm which frame is meant.
train_dataset = Dataset.from_pandas(split_train_data)
train_dataset.push_to_hub("gouthamsk/esp_idf_text",split="train")

In [None]:
from sklearn.model_selection import train_test_split
from datasets import Dataset
# Drop duplicate rows across all columns
# NOTE(review): `new_df` is not defined in any visible cell — confirm where it
# comes from.
new_df = new_df.drop_duplicates()
# Hold out 20% of the rows for evaluation; no random_state is passed, so the
# split is presumably different on every run.
train_data, test_data = train_test_split(new_df, test_size=0.2)
# Wrap both parts as Hugging Face Datasets.
train_dataset = Dataset.from_pandas(train_data)
eval_dataset = Dataset.from_pandas(test_data)

In [None]:
# Display the held-out test frame.
test_data

In [None]:

# Upload the two parts to the same Hub repository as its "train" and "test"
# splits.
train_dataset.push_to_hub("gouthamsk/esp_idf_text",split="train")
eval_dataset.push_to_hub("gouthamsk/esp_idf_text",split="test")

In [None]:

# Display both Datasets as a tuple.
train_dataset,eval_dataset

In [None]:

# Tokenize the "text" field of both the training and the evaluation Dataset.
tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt)
tokenized_val_dataset = eval_dataset.map(generate_and_tokenize_prompt)


The code below cleans up the code dataset
