In [119]:
import pandas as pd
from lorem_text import lorem

In [120]:
# 1. Define the source "Lorem Ipsum" text
lorem_ipsum_source = lorem.paragraphs(500)

lorem_ipsum_source

'Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.\nFacilis voluptate expedita sint similique blanditiis, quod quidem alias adipisci laboriosam sint dolor soluta eaque rem est? Esse labore vel accusantium ratione reprehenderit mollitia, voluptatum accusantium atque consectetur corrupti ullam modi rem rerum iure nisi, quisquam velit ducimus nobis nisi illo sapiente, autem aliquid repellendus quia et saepe, mollitia fuga enim a eos earum ex dolores. Rerum officiis asperiores velit molestias quos delectus, mollitia sapiente saepe nobis voluptates. Quis debitis minima optio sapiente molestias iure do

In [121]:
# 2. Generation Parameters
# max_words: The total number of words in the longest text entry.
# word_interval: The number of new words to add for each subsequent row.
max_words = 10000 
word_interval = 5 # The number of words to add for each new row

# 3. Prepare the data generation
words = lorem_ipsum_source.split()
dataset_rows = []

# Ensure the requested number of words does not exceed the source text length
if max_words > len(words):
    print(f"Warning: max_words ({max_words}) exceeds the number of available words in the source text ({len(words)}).")
    print(f"The dataset will be capped at {len(words)} rows.")
    max_words = len(words)

# 4. Loop to generate each row of the dataset using the specified interval
# The range function now starts at 'word_interval' and steps by 'word_interval'.
# We use enumerate to generate a unique ID for each row, starting from 0.
for idx, i in enumerate(range(word_interval, max_words + 1, word_interval)):
    # Select the first 'i' words from the source list
    current_words_list = words[:i]
    
    # Join the words to form the text for the current row
    text = " ".join(current_words_list)
    
    # Get the last word
    last_word = current_words_list[-1]
    
    # Append the generated data as a dictionary to our list
    dataset_rows.append({
        'id': idx,
        'text': text,
        'char_count': len(text),
        'word_count': i,
        'last_word': last_word
    })

# 5. Create the final pandas DataFrame
df_lorem = pd.DataFrame(dataset_rows)

# 6. Display the head and tail of the generated DataFrame to verify
print(f"Successfully generated a dataset with {len(df_lorem)} rows.")
print("\nFirst 5 rows:")
display(df_lorem.head())

print("\nLast 5 rows:")
display(df_lorem.tail())

Successfully generated a dataset with 2000 rows.

First 5 rows:


Unnamed: 0,id,text,char_count,word_count,last_word
0,0,"Lorem ipsum dolor sit amet,",27,5,"amet,"
1,1,"Lorem ipsum dolor sit amet, consectetur adipis...",64,10,do
2,2,"Lorem ipsum dolor sit amet, consectetur adipis...",100,15,labore
3,3,"Lorem ipsum dolor sit amet, consectetur adipis...",127,20,Ut
4,4,"Lorem ipsum dolor sit amet, consectetur adipis...",154,25,quis



Last 5 rows:


Unnamed: 0,id,text,char_count,word_count,last_word
1995,1995,"Lorem ipsum dolor sit amet, consectetur adipis...",75305,9980,"culpa,"
1996,1996,"Lorem ipsum dolor sit amet, consectetur adipis...",75344,9985,corporis
1997,1997,"Lorem ipsum dolor sit amet, consectetur adipis...",75382,9990,veniam
1998,1998,"Lorem ipsum dolor sit amet, consectetur adipis...",75411,9995,beatae
1999,1999,"Lorem ipsum dolor sit amet, consectetur adipis...",75445,10000,laborum


In [122]:
# 7. Export the DataFrame to a CSV file
output_filename = '../dataset/lorem_ipsum_dataset.csv'
df_lorem.to_csv(output_filename, index=False)

print(f"DataFrame successfully exported to '{output_filename}'.")

DataFrame successfully exported to '../dataset/lorem_ipsum_dataset.csv'.
