In [1]:
!pip install meaningless
!pip install tiktoken
!pip install bs4
!pip install gspread
!pip install oauth2client

Collecting meaningless
  Downloading meaningless-1.0.0-py3-none-any.whl (41 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/41.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.8/41.8 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Collecting ruamel.yaml>=0.17.21 (from meaningless)
  Downloading ruamel.yaml-0.18.6-py3-none-any.whl (117 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.8/117.8 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xmltodict>=0.13.0 (from meaningless)
  Downloading xmltodict-0.13.0-py2.py3-none-any.whl (10.0 kB)
Collecting ruamel.yaml.clib>=0.2.7 (from ruamel.yaml>=0.17.21->meaningless)
  Downloading ruamel.yaml.clib-0.2.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (526 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m526.7/526.7 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstal

In [2]:
# Reading CSV & JSON data and scraping Wikipedia
import pandas as pd
import requests
from bs4 import BeautifulSoup
import csv
import json
from meaningless import WebExtractor
import tiktoken
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import re

In [3]:
csv.field_size_limit(2147483647)

131072

In [4]:
# Function to extract sentences from data
def extract_sentences(data, extraction):
    sentences = []
    for item in data:
        try:
            sentences.append(item[extraction])
        except KeyError:
            print(f"Key '{extraction}' not found in data item. Skipping this item.")
    return sentences

In [5]:
# Function to create a dataframe from sentences
def create_dataframe(sentences, id_prefix):
    df = pd.DataFrame({'text': sentences})
    df['id'] = id_prefix + df.reset_index().index.astype(str)
    df = df[['id', 'text']]
    return df

In [6]:
# Function to count tokens in a text
def count_tokens(text):
    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
    return len(encoding.encode(text))

In [7]:
# Function to get data from an input source
def get_data(input_source):
    try:
        if input_source.endswith('.jsonl'):
            response = requests.get(input_source)
            response.raise_for_status()
            data = [json.loads(line) for line in response.text.splitlines()]
        elif input_source.endswith('.csv'):
            response = requests.get(input_source)
            response.raise_for_status()
            data = [row for row in csv.DictReader(response.text.splitlines())]
        else:
            # Assuming the input source is a Google Sheets URL
            scope = ['https://spreadsheets.google.com/feeds',
                     'https://www.googleapis.com/auth/drive']
            creds = ServiceAccountCredentials.from_json_keyfile_name('rapid-agent-418714-5dd18281c337.json', scope)
            client = gspread.authorize(creds)
            sheet = client.open("Data_Blend").sheet1
            data = sheet.get_all_records()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data from {input_source}: {e}")
        return None

    return data

In [8]:
def process_all_files(variables):
    dfs = []
    for i in range(len(variables)):
        input_source = variables['input_source'][i]
        begin_extraction = variables['begin_extraction'][i]
        id_prefix = variables['id_prefix'][i]
        data = get_data(input_source)
        if data:
            sentences = extract_sentences(data, begin_extraction)
            df = create_dataframe(sentences, id_prefix)
            df['token_count'] = df['text'].apply(count_tokens)
            dfs.append(df)
    all_data_df = pd.concat(dfs, ignore_index=True)
    return all_data_df

In [9]:
# Function to scrape Wikipedia pages
def scrape_wikipedia_pages(wikipedia_sources):
    data = []
    for i, row in wikipedia_sources.iterrows():
        page = requests.get(row['input_source'])
        soup = BeautifulSoup(page.content, 'html.parser')
        for i in range(len(soup.find_all('p'))):
            text = soup.find_all('p')[i].get_text()
            text = text.replace('<p>', '').replace('</p>', '')
            text = text.replace('<a href="', '').replace('">', '')
            text = text.replace('</a>', '')
            id = row['id_prefix'] + str(i)
            data.append({'id': id, 'text': text})
    df = pd.DataFrame(data)
    df['token_count'] = df['text'].apply(count_tokens)
    return df

In [10]:
# Function to extract passages from the Bible
def extract_bible_passages(df, books_of_bible):
    bible = WebExtractor(translation='nlt')
    data = []
    for index, row in df.iterrows():
        if row['input_source'] not in books_of_bible:
            continue
        for chapter in range(int(row['begin_extraction']), int(row['end_before']) + 1):
            verse = 1
            while True:
                try:
                    passage = bible.get_passage(row['input_source'], chapter, verse)
                    passage = re.sub(r'[\u00B9\u00B2\u00B3\u2070-\u209F]', '', passage)
                    passage = re.sub(r'[^\w\s,;:.?!]', '', passage)
                    id = row['input_source'] + str(chapter).zfill(3) + str(verse).zfill(2)
                    data.append({'id': id, 'text': passage})
                    verse += 1
                except Exception:
                    break
    df = pd.DataFrame(data)
    df['token_count'] = df['text'].apply(count_tokens)
    return df

In [12]:
# Main function
if __name__ == '__main__':
    scope = ['https://spreadsheets.google.com/feeds',
             'https://www.googleapis.com/auth/drive']
    creds = ServiceAccountCredentials.from_json_keyfile_name('rapid-agent-418714-5dd18281c337.json', scope)
    client = gspread.authorize(creds)
    sheet = client.open("Data_Blend").worksheet("datasets")
    variables = pd.DataFrame(sheet.get_all_records())

    books_of_bible = ['Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy', 'Joshua', 'Judges', 'Ruth', '1 Samuel', '2 Samuel', '1 Kings', '2 Kings', '1 Chronicles', '2 Chronicles', 'Ezra', 'Nehemiah', 'Esther', 'Job', 'Psalms', 'Proverbs', 'Ecclesiastes', 'Song of Solomon', 'Isaiah', 'Jeremiah', 'Lamentations', 'Ezekiel', 'Daniel', 'Hosea', 'Joel', 'Amos', 'Obadiah', 'Jonah', 'Micah', 'Nahum', 'Habakkuk', 'Zephaniah', 'Haggai', 'Zechariah', 'Malachi', 'Matthew', 'Mark', 'Luke', 'John', 'Acts', 'Romans', '1 Corinthians', '2 Corinthians', 'Galatians', 'Ephesians', 'Philippians', 'Colossians', '1 Thessalonians', '2 Thessalonians', '1 Timothy', '2 Timothy', 'Titus', 'Philemon', 'Hebrews', 'James', '1 Peter', '2 Peter', '1 John', '2 John', '3 John', 'Jude', 'Revelation']

    all_data_df = process_all_files(variables)
    wikipedia_sources = variables[variables['input_source'].str.startswith('https://en.wikipedia.org/wiki/')]
    wikipedia_df = scrape_wikipedia_pages(wikipedia_sources)
    bible_df = extract_bible_passages(variables, books_of_bible)

    final_df = pd.concat([all_data_df, wikipedia_df, bible_df], ignore_index=True)
    final_df.to_csv('all_data.csv', index=False)

Key 'p' not found in data item. Skipping this item.
Key 'p' not found in data item. Skipping this item.
Key 'p' not found in data item. Skipping this item.
Key 'p' not found in data item. Skipping this item.
Key 'p' not found in data item. Skipping this item.
Key 'p' not found in data item. Skipping this item.
Key 'p' not found in data item. Skipping this item.
Key 'p' not found in data item. Skipping this item.
Key 'p' not found in data item. Skipping this item.
Key 'p' not found in data item. Skipping this item.
Key 'p' not found in data item. Skipping this item.
Key 'p' not found in data item. Skipping this item.
Key 'p' not found in data item. Skipping this item.
Key 'p' not found in data item. Skipping this item.
Key 'p' not found in data item. Skipping this item.
Key 'p' not found in data item. Skipping this item.
Key 'p' not found in data item. Skipping this item.
Key 'p' not found in data item. Skipping this item.
Key 'p' not found in data item. Skipping this item.
Key 'p' not 

KeyboardInterrupt: 

In [None]:
# Read variables from Google Sheets
"""scope = ['https://spreadsheets.google.com/feeds',
         'https://www.googleapis.com/auth/drive']
creds = ServiceAccountCredentials.from_json_keyfile_name('rapid-agent-418714-5dd18281c337.json', scope)
client = gspread.authorize(creds)
sheet = client.open("Data_Blend").worksheet("datasets")
variables = pd.DataFrame(sheet.get_all_records())

In [None]:
# Loop through each row in the Google Sheets
"""for i in range(len(variables)):
    input_source = variables['input_source'][i]
    output_file = variables['output_file'][i]
    begin_extraction = variables['begin_extraction'][i]
    id_prefix = variables['id_prefix'][i]
    data = get_data(input_source)
    if data:
        sentences = extract_sentences(data, begin_extraction)
        df = create_dataframe(sentences, id_prefix)
        df['token_count'] = df['text'].apply(count_tokens)
        df.to_csv(output_file, index=False)

Key 'p' not found in data item. Skipping this item.
Key 'p' not found in data item. Skipping this item.
Key 'p' not found in data item. Skipping this item.
Key 'p' not found in data item. Skipping this item.
Key 'p' not found in data item. Skipping this item.
Key 'p' not found in data item. Skipping this item.
Key 'p' not found in data item. Skipping this item.
Key 'p' not found in data item. Skipping this item.
Key 'p' not found in data item. Skipping this item.
Key 'p' not found in data item. Skipping this item.
Key 'p' not found in data item. Skipping this item.
Key 'p' not found in data item. Skipping this item.
Key 'p' not found in data item. Skipping this item.
Key 'p' not found in data item. Skipping this item.
Key 'p' not found in data item. Skipping this item.
Key 'p' not found in data item. Skipping this item.
Key 'p' not found in data item. Skipping this item.
Key 'p' not found in data item. Skipping this item.
Key 'p' not found in data item. Skipping this item.
Key 'p' not 

In [None]:
"""def process_file(input_source, output_file, extraction, id_prefix):
    data = get_data(input_source)
    sentences = extract_sentences(data, extraction)
    df = create_dataframe(sentences, id_prefix)
    df['token_count'] = df['text'].apply(count_tokens)
    df.to_csv(output_file, index=False)