Install the dependency

In [None]:
!pip3 install nltk

Utility functions for cleanup

In [None]:
import os
import re
import pandas as pd
import numpy as np


def extract_text_from_folder(folder_path, column_name="text", extensions=('.txt', '.rst')):
    """Recursively read text files under ``folder_path`` into a one-column DataFrame.

    Args:
        folder_path: Root directory to walk; sub-directories are included.
        column_name: Name of the single column holding each file's contents.
        extensions: File-name suffix, or iterable of suffixes, to include.

    Returns:
        pandas.DataFrame with one row per matching file. Rows are ordered by
        directory and file name so repeated runs give the same row order.

    Raises:
        UnicodeDecodeError: If a matching file is not valid UTF-8.
    """
    # str.endswith accepts a str or a tuple, so normalise lists and sets.
    suffixes = extensions if isinstance(extensions, str) else tuple(extensions)
    text_data = []
    for root, dirs, files in os.walk(folder_path):
        dirs.sort()  # in-place sort steers the order os.walk descends in
        for file_name in sorted(files):
            if file_name.endswith(suffixes):
                file_path = os.path.join(root, file_name)
                with open(file_path, 'r', encoding='utf-8') as file:
                    text_data.append(file.read())
    # Build the column straight from the list: going through np.array would
    # create a fixed-width unicode array (every entry padded to the longest
    # file) and drop trailing NUL characters.
    return pd.DataFrame({column_name: text_data})
def extract_code_from_folder(folder_path, column_name="text", extensions=('.h',)):
    """Recursively read source files under ``folder_path`` into a one-column DataFrame.

    Args:
        folder_path: Root directory to walk; sub-directories are included.
        column_name: Name of the single column holding each file's contents.
        extensions: File-name suffix, or iterable of suffixes, to include.
            Defaults to C header files (``.h``).

    Returns:
        pandas.DataFrame with one row per matching file. Rows are ordered by
        directory and file name so repeated runs give the same row order.

    Raises:
        UnicodeDecodeError: If a matching file is not valid UTF-8.
    """
    # str.endswith accepts a str or a tuple, so normalise lists and sets.
    suffixes = extensions if isinstance(extensions, str) else tuple(extensions)
    text_data = []
    for root, dirs, files in os.walk(folder_path):
        dirs.sort()  # in-place sort steers the order os.walk descends in
        for file_name in sorted(files):
            if file_name.endswith(suffixes):
                file_path = os.path.join(root, file_name)
                with open(file_path, 'r', encoding='utf-8') as file:
                    text_data.append(file.read())
    # Build the column straight from the list: going through np.array would
    # create a fixed-width unicode array (every entry padded to the longest
    # file) and drop trailing NUL characters.
    return pd.DataFrame({column_name: text_data})

def clean_text_data(text_data):
    """Strip documentation markup from ``text_data`` and return the result.

    The substitutions run in the listed order; each one deletes its matches.
    Note that the ``<...>`` rule removes any angle-bracketed span, not only
    HTML tags (for example the ``<stdio.h>`` in a C ``#include`` line).

    Args:
        text_data: The text to clean.

    Returns:
        The text with all matched spans removed.
    """
    rules = (
        # ``:link_to_translation:`...``` roles
        (r'\:link_to_translation\:\`[^`]*\`', 0),
        # Triple-backtick fenced code blocks, possibly spanning lines
        (r'```.*?```', re.DOTALL),
        # Anything enclosed in angle brackets (HTML tags)
        (r'<[^>]+>', 0),
        # Section titles that have both an overline and an underline
        (r'^=+\n.*\n=+\n', re.MULTILINE),
        (r'^-+\n.*\n-+\n', re.MULTILINE),
        # ``*`` bullet list items
        (r'^\s*\*\s+.*\n', re.MULTILINE),
        # ``1.`` style numbered list items
        (r'^\s*\d+\.\s+.*\n', re.MULTILINE),
        # Leftover runs of decoration characters
        (r'====+', 0),
        (r'\^\^\^\^+', 0),
        (r'----+', 0),
        (r'\*\*\*\*\*+', 0),
    )
    for pattern, flags in rules:
        text_data = re.sub(pattern, '', text_data, flags=flags)
    return text_data

Split the textual data into chunks

In [None]:

import pandas as pd
import nltk.data

# Load the NLTK sentence tokenizer
# NOTE(review): split_text() below splits with a regex and never uses this
# object, and the name `tokenizer` is rebound to the Hugging Face tokenizer in
# the "Token distribution plotting" cell -- confirm this load is still needed.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')


def split_text(text, chunk_size=500):
    """Split ``text`` into consecutive chunks that end on sentence punctuation.

    A chunk is closed at the first ``.``, ``!`` or ``?`` reached once the
    chunk holds at least ``chunk_size`` characters, so chunks may be longer
    than ``chunk_size``. Text left after the last closed chunk is returned
    as a final (possibly shorter) chunk.

    Args:
        text: The string to split.
        chunk_size: Minimum number of characters before a chunk may be closed.

    Returns:
        List of chunks; joining them reproduces ``text``. Empty for ``""``.
    """
    chunks = []
    chunk_start = 0
    # Jump from one sentence-ending character to the next and slice, instead
    # of growing a string and running a regex search for every character.
    for match in re.finditer(r'[.!?]', text):
        chunk_end = match.end()
        if chunk_end - chunk_start >= chunk_size:
            chunks.append(text[chunk_start:chunk_end])
            chunk_start = chunk_end
    if chunk_start < len(text):  # Append any remaining part
        chunks.append(text[chunk_start:])
    return chunks

def split_dataframe_manual(df, column_name='text', chunk_size=500):
    """Expand each row of ``df`` into one row per text chunk.

    The text in ``column_name`` is split with ``split_text``; every resulting
    chunk becomes its own row, with the other columns and the original index
    label repeated.

    Args:
        df: Input DataFrame; ``column_name`` must hold strings.
        column_name: Column containing the text to split.
        chunk_size: Passed through to ``split_text``.

    Returns:
        A new DataFrame with the same columns as ``df`` (also when the result
        is empty). Rows whose text produces no chunks are dropped.
    """
    chunked = df.copy()
    chunked[column_name] = [split_text(text, chunk_size) for text in df[column_name]]
    # explode() emits one row per list element and repeats the remaining
    # columns; an empty chunk list turns into a single NaN placeholder row.
    chunked = chunked.explode(column_name)
    # Drop those placeholders so empty texts contribute no rows.
    return chunked[chunked[column_name].notna()]



Token distribution plotting

In [None]:
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
# Model whose tokenizer is used for the token-length statistics below.
base_model_id = "mistralai/Mistral-7B-Instruct-v0.2"
# NOTE: this rebinds `tokenizer`, replacing the NLTK sentence tokenizer that
# the chunk-splitting cell stored under the same name.
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    padding_side="left",
    add_eos_token=True,
    add_bos_token=True,
)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

def generate_and_tokenize_prompt(prompt):
    """Tokenize the ``"text"`` field of ``prompt`` with the module-level ``tokenizer``."""
    prompt_text = prompt["text"]
    return tokenizer(prompt_text)

def plot_data_lengths(tokenized_train_dataset, column_name="text"):
    """Plot a histogram of ``len(row[column_name])`` over a dataset.

    With the default ``column_name="text"`` the plotted lengths are character
    counts of the text; pass ``column_name="input_ids"`` to plot token counts
    of a tokenized dataset.

    Args:
        tokenized_train_dataset: Iterable of mapping-like rows.
        column_name: Key of the sized value to measure in every row.

    Returns:
        None. Prints the number of rows and shows the figure.
    """
    lengths = [len(row[column_name]) for row in tokenized_train_dataset]
    print(len(lengths))

    # Label the plot with the column that was actually measured.
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.hist(lengths, bins=20, alpha=0.7, color='blue')
    ax.set_xlabel(f'Length of {column_name}')
    ax.set_ylabel('Frequency')
    ax.set_title(f'Distribution of Lengths of {column_name}')
    plt.show()



ESP-IDF docs cleanup

In [None]:
text_data = extract_text_from_folder("./raw_dataset_USFTHF/docs/",column_name="text")
text_data


In [None]:
# Drop identical documents first, then clean the text in every cell.
clean_data = text_data.drop_duplicates().map(clean_text_data)


In [None]:
from datasets import Dataset
# preserve_index=False keeps the pandas index (which may no longer be a plain
# range after drop_duplicates) from being stored as an extra
# `__index_level_0__` column in the dataset.
train_dataset = Dataset.from_pandas(clean_data, preserve_index=False)

In [None]:
tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt)
# Histogram the token counts (`input_ids`), not the character length of `text`.
plot_data_lengths(tokenized_train_dataset, column_name="input_ids")

In [None]:
# Break every document into chunks that close at the first sentence end after
# 1000 characters.
split_data = split_dataframe_manual(clean_data, column_name="text", chunk_size=1000)

# Chunk rows repeat the index label of their source row; drop the index so it
# is not exported as an `__index_level_0__` column.
train_dataset = Dataset.from_pandas(split_data, preserve_index=False)
train_dataset

In [None]:
train_dataset

In [None]:

tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt)
# Histogram the token counts (`input_ids`), not the character length of `text`.
plot_data_lengths(tokenized_train_dataset, column_name="input_ids")


Remove overly long chunks


In [None]:
# Keep only chunks of at most 4000 characters.
split_data = split_data[split_data['text'].apply(lambda x: len(str(x)) <= 4000)]

# Filtering leaves gaps (and repeated labels) in the index; keep it out of the
# dataset instead of exporting it as `__index_level_0__`.
train_dataset = Dataset.from_pandas(split_data, preserve_index=False)
train_dataset

In [None]:

tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt)
# Histogram the token counts (`input_ids`), not the character length of `text`.
plot_data_lengths(tokenized_train_dataset, column_name="input_ids")

In [None]:
train_dataset.push_to_hub("gouthamsk/esp_idf_text",split="train")

Clean up ESP-IDF code examples

In [None]:
# Load every header file found under the code folder, one file per row.
code_data = extract_code_from_folder("./raw_dataset_USFTHF/code/",column_name="text")
# NOTE(review): this saves the raw, not-yet-cleaned text under the name
# 'clean_data.csv' -- confirm the file name is intended.
code_data.to_csv('clean_data.csv', index=False)
code_data

In [None]:
# De-duplicate first, then clean the de-duplicated frame.
# NOTE(review): clean_text_data() deletes every `<...>` span, which also
# removes the target of `#include <...>` lines in C headers -- confirm that is
# acceptable for the code dataset.
clean_code = code_data.drop_duplicates().map(clean_text_data)
# Persist the cleaned frame (not the raw one) under the "clean" file name.
clean_code.to_csv('clean_code.csv', index=False)
clean_code

In [None]:
split_data=split_dataframe_manual(clean_code,column_name="text",chunk_size=1000)


In [None]:
split_data = split_data[split_data['text'].apply(lambda x: len(str(x)) <= 10000)]

In [None]:
from datasets import Dataset
# Drop the (repeated, filtered) pandas index instead of exporting it as an
# `__index_level_0__` column.
train_dataset = Dataset.from_pandas(split_data, preserve_index=False)
tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt)
# Histogram the token counts (`input_ids`), not the character length of `text`.
plot_data_lengths(tokenized_train_dataset, column_name="input_ids")

In [None]:

train_dataset.push_to_hub("gouthamsk/esp_idf_code",split="train")


In [None]:
from sklearn.model_selection import train_test_split
from datasets import Dataset
# NOTE(review): `new_df` is not assigned in any cell shown before this one, and
# no duplicates are dropped here; a later cell de-duplicates `new_df` and
# overwrites train_data/test_data -- confirm this cell is still needed.

train_data, test_data = train_test_split(new_df, test_size=0.1)


In [None]:
# NOTE(review): `split_train_data` is not assigned in any visible cell --
# confirm where it is meant to come from.
train_dataset = Dataset.from_pandas(split_train_data)
train_dataset.push_to_hub("gouthamsk/esp_idf_text",split="train")

In [None]:
from sklearn.model_selection import train_test_split
from datasets import Dataset
# Drop duplicate rows across all columns
# NOTE(review): `new_df` is not assigned in any visible cell -- confirm its source.
new_df = new_df.drop_duplicates()
# Fixed seed so the train/test membership is identical on every run.
train_data, test_data = train_test_split(new_df, test_size=0.2, random_state=42)
# The shuffled split leaves a non-sequential index; keep it out of the
# datasets instead of exporting it as `__index_level_0__`.
train_dataset = Dataset.from_pandas(train_data, preserve_index=False)
eval_dataset = Dataset.from_pandas(test_data, preserve_index=False)

In [None]:
test_data

In [None]:

train_dataset.push_to_hub("gouthamsk/esp_idf_text",split="train")
eval_dataset.push_to_hub("gouthamsk/esp_idf_text",split="test")

In [None]:

train_dataset,eval_dataset

In [None]:

tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt)
tokenized_val_dataset = eval_dataset.map(generate_and_tokenize_prompt)


Handling Junk Data


In [None]:
!pip3 install justext

In [None]:
text = extract_text_from_folder("./raw_dataset_USFTHF/docs/",column_name="text")
text

In [None]:
import requests
import justext
# `text` is a DataFrame with one document per row, while justext.justext()
# takes the markup of a single document, so classify each document on its own.
stoplist = justext.get_stoplist("English")
for document in text["text"]:
    if not document.strip():
        continue  # nothing to classify in an empty document
    for paragraph in justext.justext(document, stoplist):
        if not paragraph.is_boilerplate:
            print(paragraph.text)

https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/api-conventions.html

In [None]:
# ESP-IDF (stable, ESP32) API reference pages, one URL per entry.
# Every entry needs its own trailing comma: adjacent string literals without
# one are silently concatenated into a single invalid URL.
links=[
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/api-conventions.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/protocols/mqtt.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/protocols/esp_tls.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/protocols/esp_http_client.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/protocols/esp_local_ctrl.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/protocols/esp_serial_slave_link.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/protocols/esp_crt_bundle.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/protocols/esp_http_server.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/protocols/esp_https_server.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/protocols/icmp_echo.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/protocols/mbedtls.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/bluetooth/esp_bt_defs.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/bluetooth/esp_bt_main.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/bluetooth/esp_bt_device.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/bluetooth/esp_gap_ble.html#",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/bluetooth/esp_gatt_defs.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/bluetooth/esp_gatts.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/bluetooth/esp_gattc.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/bluetooth/esp_blufi.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/bluetooth/esp_gap_bt.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/bluetooth/esp_a2dp.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/bluetooth/esp_avrc.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/bluetooth/esp_spp.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/bluetooth/esp_hf_defs.html#",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/bluetooth/esp_hf_client.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/bluetooth/esp_hf_ag.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/bluetooth/esp_hidd.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/bluetooth/esp_l2cap_bt.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/bluetooth/esp_sdp.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/bluetooth/controller_vhci.html#application-example",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/bluetooth/esp-ble-mesh.html#",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/bluetooth/nimble/index.html#threading-model",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/error-codes.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/network/esp_now.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/network/esp-wifi-mesh.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/network/esp_smartconfig.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/network/esp_dpp.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/network/esp_nan.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/network/esp_eth.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/network/esp_openthread.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/network/esp_netif.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/network/esp_netif_driver.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/peripherals/adc_oneshot.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/peripherals/adc_continuous.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/peripherals/adc_calibration.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/peripherals/clk_tree.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/peripherals/dac.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/peripherals/gpio.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/peripherals/gptimer.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/peripherals/i2c.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/peripherals/i2s.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/peripherals/lcd.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/peripherals/ledc.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/peripherals/mcpwm.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/peripherals/pcnt.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/peripherals/rmt.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/peripherals/sdspi_host.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/peripherals/sdio_slave.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/peripherals/sdm.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/peripherals/spi_flash/index.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/peripherals/spi_master.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/peripherals/spi_slave.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/peripherals/secure_element.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/peripherals/touch_pad.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/peripherals/twai.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/peripherals/uart.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/kconfig.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/provisioning/protocomm.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/provisioning/provisioning.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/provisioning/wifi_provisioning.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/storage/fatfs.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/storage/mass_mfg.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/storage/nvs_flash.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/storage/nvs_encryption.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/storage/nvs_partition_gen.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/storage/sdmmc.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/storage/partition.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/storage/spiffs.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/storage/vfs.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/storage/wear-levelling.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/system/app_image_format.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/system/bootloader_image_format.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/system/app_trace.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/system/esp_function_with_shared_stack.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/system/chip_revision.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/system/console.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/system/efuse.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/system/esp_err.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/system/esp_https_ota.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/system/esp_event.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/system/freertos.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/system/freertos_idf.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/system/freertos_additions.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/system/mem_alloc.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/system/mm.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/system/heap_debug.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/system/esp_timer.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/system/internal-unstable.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/system/ipc.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/system/intr_alloc.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/system/log.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/system/misc_system_api.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/system/ota.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/system/perfmon.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/system/power_management.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/system/pthread.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/system/random.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/system/sleep_modes.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/system/soc_caps.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/system/system_time.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/system/himem.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/system/ulp.html",
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/system/wdts.html",
    ]

In [None]:
import requests
import justext

# Download one API page; the timeout keeps a stalled connection from hanging
# the kernel.
response = requests.get(
    "https://docs.espressif.com/projects/esp-idf/en/stable/esp32/api-reference/bluetooth/esp_gap_ble.html#",
    timeout=30,
)
response.raise_for_status()  # fail here rather than parsing an error page
paragraphs = justext.justext(response.content, justext.get_stoplist("English"))
for paragraph in paragraphs:
    if not paragraph.is_boilerplate:
        print(paragraph.text)
paragraphs

In [None]:
!pip install lxml_html_clean


In [None]:
!pip3 install --force-reinstall --no-deps babel==2.13.1 
!pip3 install cchardet  # single package only
!pip3 install trafilatura[all] 

In [None]:
import trafilatura

# NOTE(review): `save` is only assigned in a cell further down
# (save = known_urls), so this cell fails on a fresh top-to-bottom run --
# confirm the intended URL source.
urls = save
for url in urls:
    downloaded = trafilatura.fetch_url(url)
    # Only extract when the download produced something.
    if downloaded:
        extracted_text = trafilatura.extract(downloaded)
        print(f"Content from {url}:\n{extracted_text}\n")


In [None]:
result

In [None]:
data

In [64]:
file_path = "data.txt"

# NOTE(review): `data` is assigned by the extraction loop in a later cell.
# Write as UTF-8 explicitly so non-ASCII page text does not depend on the
# platform's default encoding.
with open(file_path, 'w', encoding='utf-8') as file:
    # Write the data to the file
    file.write(data["text"])

In [None]:
import requests
import justext

# Download the page; the timeout keeps a stalled connection from hanging the
# kernel.
response = requests.get(
    "https://dev.ti.com/tirex/explore/content/mspm0_academy_2_01_01_00/_build_mspm0_academy_2_01_01_00/index.html",
    timeout=30,
)
response.raise_for_status()  # fail here rather than parsing an error page
paragraphs = justext.justext(response.content, justext.get_stoplist("English"))
file_path = "data2.txt"

# Keep only the paragraphs jusText does not classify as boilerplate.
content_paragraphs = [p.text for p in paragraphs if not p.is_boilerplate]

# Explicit UTF-8 so non-ASCII text does not depend on the platform default.
with open(file_path, 'w', encoding='utf-8') as file:
    for paragraph_text in content_paragraphs:
        # One paragraph per line so paragraphs do not run together in the file.
        file.write(paragraph_text + "\n")
        print(paragraph_text)

# All kept paragraphs joined back to back.
a = "".join(content_paragraphs)
print(a)

In [None]:
!pip install trafilatura

In [81]:
from trafilatura.spider import focused_crawler

# Entry page the crawl starts from.
homepage = 'https://dev.ti.com/tirex/explore/content/mspm0_academy_2_01_01_00/_build_mspm0_academy_2_01_01_00/index.html'
# starting a crawl; the two results are stored as `to_visit` and `known_urls`
to_visit, known_urls = focused_crawler(homepage, max_seen_urls=10, max_known_urls=100000)
# (the crawl is resumed in the next cell)


In [None]:
# resuming a crawl: pass the previous `to_visit` / `known_urls` back in
to_visit, known_urls = focused_crawler(homepage, max_seen_urls=10, max_known_urls=100000, todo=to_visit, known_links=known_urls)
print(len(to_visit), len(known_urls))

In [None]:
save =known_urls
save

In [24]:
import json
file_path = "url_list.json"
with open(file_path, 'w', encoding='utf-8') as file:
    # Write a real JSON array, since the file is named .json. json.dump takes
    # the open file object, not the path; list() covers the case where
    # `to_visit` is not a plain list.
    json.dump(list(to_visit), file, indent=1)

In [110]:
# import the necessary functions
from trafilatura import fetch_url, extract
import json

# Fetch every crawled URL and store its extracted text as one JSON object per
# line (JSON Lines).
file_path = "data.jsonl"
# Open the file once, outside the loop: reopening it in 'w' mode for each URL
# would truncate it and keep only the last page.
with open(file_path, 'w', encoding='utf-8') as file:
    for url in save:
        downloaded = fetch_url(url)
        if downloaded is None:
            continue  # download failed, nothing to extract
        extracted = extract(downloaded, output_format="json")
        if extracted is None:
            continue  # no extractable content on this page
        data = json.loads(extracted)
        file.write(json.dumps({"text": data["text"]}, ensure_ascii=False) + "\n")

In [114]:
!pip install datasets

[0m

In [115]:
from datasets import load_dataset, Dataset

# Load the JSONL file as a dataset
# dataset = load_dataset('json', data_files='data.jsonl', split='train')

# # Display the dataset
# print(dataset)


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject