In [8]:
import pandas as pd
import random
import nltk

# Single seed shared by every source of randomness in this notebook.
random_seed = 0
# Dedicated generator, independent of the module-level `random` state.
rng = random.Random(random_seed)
# Also seed the module-level generator used by plain `random.*` calls.
random.seed(random_seed)

In [9]:
# Load the Wikipedia slice and show its columns and (rows, columns) shape.
df = pd.read_json(
    "../wikipedia_slice.jsonl",
    lines=True,  # the file is JSON Lines: one JSON object per line
)
print(df.columns)
print(df.shape)

Index(['id', 'url', 'title', 'text'], dtype='object')
(30000, 4)


In [10]:
def is_valid_sentence_length(sentence, min_words=10, max_words=200):
    """Tell whether ``sentence`` has an acceptable number of tokens.

    The length is measured in tokens as returned by ``nltk.word_tokenize``,
    which is not necessarily the same as a whitespace word count.

    Args:
        sentence: Text to measure.
        min_words: Exclusive lower bound; the token count must be strictly
            greater than this value.
        max_words: Exclusive upper bound; the token count must be strictly
            smaller than this value.

    Returns:
        True when ``min_words < token_count < max_words``, otherwise False.
    """
    token_count = len(nltk.word_tokenize(sentence))

    # Both bounds are strict: exactly `min_words` or `max_words` tokens is rejected.
    if token_count <= min_words:
        return False
    return token_count < max_words
    

def extract_sentences_from_dataframe(df, text_column="text"):
    """Collect every length-valid line of text from one column of ``df``.

    Each cell of ``text_column`` is split on newline characters, and every
    resulting piece accepted by ``is_valid_sentence_length`` is kept. The
    pieces are whole lines, so a single piece may contain several sentences.

    Args:
        df: DataFrame that contains ``text_column``.
        text_column: Name of the column holding the raw text. Defaults to
            ``"text"``.

    Returns:
        A flat list of the accepted lines, in row order and, within a row, in
        order of appearance.

    Raises:
        KeyError: If ``text_column`` is not a column of ``df``.
    """
    sentences = []

    # Iterate the column directly: iterrows() would build a full Series for
    # every row only to read a single field from it.
    for text in df[text_column]:
        # Skip cells that hold no text (e.g. None/NaN for a missing article body)
        # rather than failing on `.split` part-way through a long run.
        if not isinstance(text, str):
            continue

        # Too-short and too-long lines are rejected by the length check.
        sentences.extend(
            line for line in text.split("\n") if is_valid_sentence_length(line)
        )

    return sentences

# Build the candidate pool: every newline-delimited line of article text that
# passes the length filter. Each entry is a whole line, so it may contain
# several sentences despite the variable name.
sentences_list = extract_sentences_from_dataframe(df)

In [12]:
# Show one example entry, then report the size of the candidate pool.
# `print` returns None, so it is kept out of the displayed expression;
# the bare last line is what the cell shows as its result.
print(sentences_list[1])
len(sentences_list)

Humans lived in societies without formal hierarchies long before the establishment of formal states, realms, or empires. With the rise of organised hierarchical bodies, scepticism toward authority also rose. Although traces of anarchist thought are found throughout history, modern anarchism emerged from the Enlightenment. During the latter half of the 19th and the first decades of the 20th century, the anarchist movement flourished in most parts of the world and had a significant role in workers' struggles for emancipation. Various anarchist schools of thought formed during this period. Anarchists have taken part in several revolutions, most notably in the Paris Commune, the Russian Civil War and the Spanish Civil War, whose end marked the end of the classical era of anarchism. In the last decades of the 20th and into the 21st century, the anarchist movement has been resurgent once more.


(1474141, None)

In [14]:
# Draw the sample from a generator freshly seeded with `random_seed`, so this
# cell selects the same 6000 entries every time it is run, no matter how much
# of the module-level `random` state earlier cell executions have consumed.
# Raises ValueError if fewer than 6000 entries are available.
filtered_sampled_sentences = random.Random(random_seed).sample(sentences_list, 6000)
print(len(filtered_sampled_sentences))

6000


In [15]:
# Put the sampled texts in a DataFrame next to their word counts.
# `word_count` is a plain whitespace split count (str.split), which is a
# different measure from the nltk token count used by the length filter.
sentences_df = pd.DataFrame(
    {
        'sentence': filtered_sampled_sentences,
        'word_count': list(map(len, map(str.split, filtered_sampled_sentences))),
    }
)

In [16]:
# Report the mean whitespace word count of the sample, then preview the frame.
print(sentences_df["word_count"].mean(), sentences_df, sep="\n")

50.27066666666666
                                               sentence  word_count
0     In 2020, Uproxx writer Josh Kurp stated that w...         152
1     Since at least 2013, scientists have been tryi...          31
2     Legend has it that Zhōu Yōu Wáng, king of the ...          69
3     In a process known as sowing, all the seeds fr...          79
4      July 3 – Sofia Alekseyevna of Russia, regent ...          10
...                                                 ...         ...
5995  By late 1982, Morrissey had chosen the band na...          80
5996  The upper culmination of the vernal point is c...          32
5997  The -dimensional hypercube obtained as the con...          43
5998   Sumner, William Hyslop, An Inquiry Into the I...          39
5999   Allegory: An extended metaphor wherein a stor...          14

[6000 rows x 2 columns]


In [17]:
# Write the sampled texts to disk, one per line.
# UTF-8 is requested explicitly because the Wikipedia text contains non-ASCII
# characters; without it `open` falls back to the platform's default encoding,
# which can fail on such characters or produce a file that is not UTF-8.
with open("../wiki_perplexity_sample.txt", 'w', encoding="utf-8") as f:
    f.writelines(sentence + '\n' for sentence in sentences_df['sentence'])