In [1]:
import json
import pandas as pd
from datasets import load_dataset

In [2]:
# Methods
def make_df(eq, col_name):
    df_ = df[df[col_name] == eq].reset_index(drop=True)
    return df_



# Codes

In [77]:
# Pull n examples from a huggingface streaming dataset
def sample_dataset(dataset_name, n=5000):
    dataset = load_dataset(dataset_name, split='train', streaming=True)

    samples = []
    langs = []
    # Boxes up the required fields from the streaming dataset
    for example in dataset:
        samples.append(example['code'])
        langs.append(example['language'])
        if len(samples) >= n:
            break

    return samples, langs

samples, langs = sample_dataset('codeparrot/github-code')

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [78]:
df = pd.DataFrame({'text':samples,'label':langs})

# Take 25 from the top 12 languages
variety = 12
sample_size = 25
langs = df['label'].value_counts(ascending=False).index[:variety]

dfs = [make_df(x, 'label') for x in langs]
samples = pd.concat([ dfs[x].sample(sample_size) for x in range(variety) ]).reset_index(drop=True)
samples.to_csv('../data/clean/misc/code.csv', index=False)
display(samples)

Unnamed: 0,text,label
0,"import {keccak256, bufferToHex} from ""ethereum...",JavaScript
1,"(function (angular) {\n ""use strict"";\n ...",JavaScript
2,"define('exports@*', [], function(require, expo...",JavaScript
3,"window.ImageViewer = function(url, alt, title)...",JavaScript
4,"import React, { Component } from 'react';\n\nc...",JavaScript
...,...,...
295,body {\n padding-top: 65px;\n}\n\n#footer {\n...,CSS
296,"@charset ""utf-8"";\n/* CSS Document */\n/* ----...",CSS
297,/*!\n * # Semantic UI 2.1.6 - Message\n * http...,CSS
298,.home-layout {\n padding-top: 50px;\n paddin...,CSS


# Med Transcripts

In [75]:
df = pd.read_csv('../data/raw/nonfiction/med_trans.csv', index_col='Unnamed: 0')
len(df)

4999

In [76]:
df = df.dropna()
df = df.drop_duplicates()
df['medical_specialty'] = [x.replace(" ", "") for x in df['medical_specialty']]
df = df[['transcription','medical_specialty']]
df = df.rename(columns={'transcription':'text','medical_specialty':'label'})
df = df.sample(300).reset_index(drop=True)
df.to_csv('../data/clean/nonfiction/med_trans_clean.csv', index=False)
display(df)

Unnamed: 0,text,label
0,HISTORY OF PRESENT ILLNESS: This is a 91-year...,EmergencyRoomReports
1,"PREOPERATIVE DIAGNOSIS:, Right breast mass wi...",Surgery
2,"DIAGNOSES PROBLEMS:,1. Orthostatic hypotensio...",OfficeNotes
3,"CC:, Falling.,HX:, This 67y/o RHF was diagnose...",Radiology
4,"PREOPERATIVE DIAGNOSIS:, Left renal mass, 5 c...",Surgery
...,...,...
295,"PRELIMINARY DIAGNOSES:,1. Contusion of the fr...",Neurology
296,"TECHNIQUE: , Sequential axial CT images were o...",Radiology
297,"PREOPERATIVE DIAGNOSES: ,1. Posttraumatic na...",ENT-Otolaryngology
298,"PREOPERATIVE DIAGNOSES:,1. Plantar flex third...",Surgery


# Movie Summaries

In [68]:
# DOWNLOAD NEEDED: Uncomment after you download the data

df = pd.read_csv('../data/raw/fiction/movie_sums.csv')
len(df)

34886

In [70]:
# Take 25 from the top 12 labeled genres
variety = 12
sample_size = 25
genres = df['Genre'].value_counts().index[1:variety+1]

dfs = [make_df(x, 'Genre') for x in genres]
samples = pd.concat([ dfs[x].sample(sample_size) for x in range(variety) ]).reset_index(drop=True)
samples = samples[['Plot','Genre']]
samples = samples.rename(columns={'Plot':'text','Genre':'label'})
samples.to_csv('../data/clean/fiction/movie_summs_clean.csv', index=False)
display(samples)

Unnamed: 0,text,label
0,U.S. Marines Lance Corporal Harold Dawson and ...,drama
1,The movie begins at present day with Saket Ram...,drama
2,The father escaped the Soviet invasion of Buda...,drama
3,A young Lakota Sioux (Lou Diamond Phillips) is...,drama
4,Heinrich Faust (Johannes Zeiler) is driven by ...,drama
...,...,...
295,Marcy Tizard (Janeane Garofalo) is assistant t...,romantic comedy
296,Vasu (Santhanam) and Saravanan (Arya) are best...,romantic comedy
297,Avinash (Allari Naresh) is a small-time crook ...,romantic comedy
298,The young and beautiful Suzanne O'Neill (Barba...,romantic comedy


# SAT Questions

In [73]:
df = pd.read_csv('../data/raw/interrogative/sat_qstns.csv')
len(df)

1380

In [74]:
txts = []
texts = df['prompt'].dropna()
txts += [x for x in texts if '?' in x]
txts = pd.DataFrame({'text':txts})
txts_smpled = txts.sample(300).reset_index(drop=True)
txts_smpled.to_csv('../data/clean/interrogative/sat_qstns_clean.csv', index=False)
display(txts_smpled)

Unnamed: 0,text
0,Which of the following is a nickname for the g...
1,Which Mesoamerican culture used chinampas to i...
2,The thin disguise of 'equal' accommodations fo...
3,"Who accused John Quincy Adams of making a ""cor..."
4,Which quote below is from Thomas Jefferson's f...
...,...
295,What was Sigmund Freud's intent with his break...
296,"In The American Crisis, no. 1, Thomas Paine re..."
297,How did the colonists react to the passage of ...
298,"It had been, to say the least, an interesting ..."


# Case Law

In [72]:
# Set up streaming dataset object
dataset_stream = load_dataset("TeraflopAI/Caselaw_Access_Project", split="train", streaming=True)
# Initialize vars
len_exam = 0
txts = []
cum_len = 0

# Iterate through streaming dataset
for example in dataset_stream:
    # Check for less than 300 examples
    if len_exam <= 300:
        # Split up current example by new line
        split_txt = example['text'].split("\n")
        # Start current example counter
        counter = 0
        # Start current example placeholder
        placeholder = 1

        # Iterate through the new line splits of the current example
        for x in range(len(split_txt)):
            # Save current line
            row = split_txt[x]
            # Save its length
            char_len = len(row)
            # If our current new_txt is less than 1500 characters:
            if cum_len < 1500:
                # Then take the current example in the range of counter -> placeholder
                new_txt = "\n".join(split_txt[counter:placeholder])
                # Reassess the sumulative new_text length
                cum_len += char_len        
            # If new_txt is over 1500 characters
            else:
                # Then store new_txt into txts
                txts.append(new_txt)
                # Increase counter to placeholder
                counter = placeholder
                # Reset cumulative length
                cum_len = 0
            # Update placeholder
            placeholder += 1
        # Update length of txts
        len_exam = len(txts)
    # Otherwise Break
    else:
        break
    
# Store new texts into a DataFrame
df = pd.DataFrame({"text":txts})
# Cleanup
df = df.dropna()
df = df.drop_duplicates()
df = df.loc[:299]
# Save
df.to_csv('../data/clean/nonfiction/case_law.csv', index=False)
display(df)

Resolving data files:   0%|          | 0/59 [00:00<?, ?it/s]

Unnamed: 0,text
0,"OPINION\nRABINOWITZ, Justice.\nI. INTRODUCTION..."
1,The data summary stated that there was a layer...
2,NCI contends that Green's visit was scheduled ...
3,"In April 1993, NCI moved for partial summary j..."
4,In Brinderson Corp. v. Hampton Rds. San. Dist....
...,...
295,Gargan and Knix dealt with notarized statement...
296,Harrison's conviction is AFFIRMED.\n. In Garga...
297,"OPINION\nEASTAUGH, Justice.\nWe must here cons..."
298,"A hearing on Huston's August 21, 1991, applica..."


# Spam Text

In [63]:
# DOWNLOAD NEEDED: Uncomment after you download the data

df = pd.read_csv('../data/raw/colloquial/spam_email.csv')

In [64]:
emails = make_df(0, 'label')
print(f" OG Emails Count: {len(emails)}")
emls_sampled = emails.sample(300).reset_index(drop=True)
eml_spam = make_df(1, 'label')
eml_spam_sampled = eml_spam.sample(150).reset_index(drop=True)

 OG Emails Count: 39538


In [65]:
df = pd.read_csv('../data/raw/colloquial/spam_text.csv', encoding='ISO-8859-1')
df = df[['v1','v2']].rename(columns={'v1':'label','v2':'text'})

In [66]:
texts = make_df('ham', 'label')
print(f" OG Texts Count: {len(texts)}")
txts_sampled = texts.sample(300).reset_index(drop=True)
txt_spam = make_df('spam', 'label')
print(f" OG Spam Txt Count: {len(txt_spam)+len(eml_spam)}")
txt_spam_sampled = txt_spam.sample(150).reset_index(drop=True)

 OG Texts Count: 4825
 OG Spam Txt Count: 44657


In [67]:
cc_spam = pd.concat([eml_spam_sampled, txt_spam_sampled]).reset_index(drop=True)
key = {1:'email','spam':'text'}
cc_spam['label'] = cc_spam['label'].map(key)
cc_spam.to_csv('../data/clean/colloquial/spam_msgs_clean.csv', index=False)
txts_sampled.to_csv('../data/clean/colloquial/txt_msgs_clean.csv', index=False)
emls_sampled.to_csv('../data/clean/colloquial/email_msgs_clean.csv', index=False)
display(cc_spam)

Unnamed: 0,label,text
0,email,special summer offer from canadianpharmacy esc...
1,email,legalrxmedications is a international online d...
2,email,victoria's secret r five hundred dollar gift c...
3,email,it is going to be huge search for asvpcurrent ...
4,email,products and great service all rolled up into ...
...,...,...
295,text,This message is free. Welcome to the new & imp...
296,text,"Free Msg: get Gnarls Barkleys \Crazy\"" rington..."
297,text,Dear U've been invited to XCHAT. This is our f...
298,text,Thanks 4 your continued support Your question ...


# Bible Prompts
*These data were generated for this event, therefore they're preprocessed entirely

In [91]:
df = pd.read_csv('../data/raw/declarative/BiblePrompts.csv', index_col='Unnamed: 0')
len(df)

480

In [92]:
df = df.sample(300).reset_index(drop=True)
df.to_csv('../data/clean/declarative/bible_clean.csv', index=False)
display(df)

Unnamed: 0,original_text,rewrite_prompt,rewritten_text
0,A strong wind was blowing and the waters grew ...,What is this about?,The text describes a strong wind and rough wat...
1,"Do not say, Why were the old days better than...",Rewrite this in the style of Shakespeare,"Sure, here's the rewritten text in the style o..."
2,"Then he told them, Now draw some out and take...",Please rephrase this to make it more clear,"Sure, here's the rephrased version:\r\n\r\nThe..."
3,In this meaningless life of mine I have seen ...,Can you turn this into a riddle?Rewrite this a...,**Riddle:**\r\n\r\nWhat is the fate of the rig...
4,How can you believe since you accept glory fro...,Convert this into a sermon,**Sermon: The Glory of God vs. the Glory of Ma...
...,...,...,...
295,"When a Samaritan woman came to draw water, Je...",Make this rhyme.,"Sure, here's the rhyme:\r\n\r\nWhen a Samarita..."
296,Then John gave this testimony: I saw the Spiri...,Change the text so it means the oposite,"Sure, here is the text changed to mean the opp..."
297,"If either of them falls down,\r\n one can ...",Can you turn this into a riddle?Rewrite this a...,**Riddle:**\r\n\r\nTwo people are standing sid...
298,"On the last and greatest day of the festival,...",Explain why this is wrong.,The text is from the Gospel of John (John 6:35...


In [94]:
rw_df = df['rewrite_prompt'].drop_duplicates()
rw_df.to_csv('../prompts/bible-prompts.csv', index=False)
display(rw_df)

0                                   What is this about?
1              Rewrite this in the style of Shakespeare
2            Please rephrase this to make it more clear
3     Can you turn this into a riddle?Rewrite this a...
4                            Convert this into a sermon
6                            Explain why this is wrong.
8               Change the text so it means the oposite
11                     Convert this into modern english
12                      Convert this into a sea shanty.
17              Rewrite this so every fifth word is dog
19                  Convert this into an engaging story
25                         Rewrite this like Shakspeare
27                              Turn this into a riddle
31                    Explain this to me like I'm five.
53                                 Make this about cats
55                      Create a rap song from the text
63                                     Make this rhyme.
71                        Convert this into old 

# Essays
*These data were generated for this event, therefore they're preprocessed entirely

In [125]:
df = pd.read_csv('../data/raw/fiction/essays.csv')
len(df)

2166

In [126]:
df = df[['original_text','rewrite_prompt','rewritten_text']]
rws = df['rewrite_prompt']
df['rewrite_prompt'] = pd.DataFrame({'prompts':[x.split(':')[1] for x in rws]})
df = df.sample(300).reset_index(drop=True)
df.to_csv('../data/clean/declarative/essays_clean.csv', index=False)
display(df)

Unnamed: 0,original_text,rewrite_prompt,rewritten_text
0,It was a typical Saturday night for me. My bud...,Use the bold and brash language of a Viking w...,"It was a typical Saturday night for me, my ber..."
1,"`` Freeze!'' six of us yelled in unison, and m...","Write in the direct, authoritative tone of a ...","""Attention all, cease your activities and free..."
2,"“ How does defeat taste, power rangers? I hope...","Write with Orwell's clarity and foresight, fo...",The echoes of Rita Repulsa's laughter reverber...
3,"`` Do n't think of it as dying,'' he said, `` ...",Rewrite this in the style of a 1940s film noi...,"""Well, you ain't gonna believe what happened, ..."
4,"Cancer, old age, suicide. There are many ways ...",Use the lyrical and storytelling style of a m...,"In this tale, I weave a story of a fateful enc..."
...,...,...,...
295,Her gaze burned me beyond a burn \n \n Through...,"Adopt King's knack for storytelling, blending...","The stench of decay lingered in the air, a sym..."
296,The Adventures of I-Can't-Believe-That-Just-Ha...,Use the slick and tough language of a 1920s g...,"Listen up, cuz I'm gonna tell you a story that..."
297,"He stood there, looking as his own grave. \n \...",Use the inventive and curious tone of a Victo...,"In the quaint Victorian era, where innovation ..."
298,As Gaius Vulpinus Rufinus rapidly tapped at hi...,Recast the essay with the optimism and specif...,"As the clock glowed 0018sc, Gaius Vulpinus Ruf..."


In [127]:
rw_df = df['rewrite_prompt'].drop_duplicates()
rw_df.to_csv('../prompts/essay-prompts.csv', index=False)
display(rw_df)

0       Use the bold and brash language of a Viking w...
1       Write in the direct, authoritative tone of a ...
2       Write with Orwell's clarity and foresight, fo...
3       Rewrite this in the style of a 1940s film noi...
4       Use the lyrical and storytelling style of a m...
                             ...                        
244     Rewrite the text as a fairy tale, complete wi...
248     Recast the essay as a tale from the American ...
267     Embrace the whimsical and rhyming style of Dr...
273     Write with the cunning and secretive style of...
280     Write as though you were a medieval peasant, ...
Name: rewrite_prompt, Length: 100, dtype: object

# Alfred Hitchcock

In [166]:
df = pd.read_csv('../data/raw/fiction/WikiAH.csv')
len(df)

153

In [167]:
df = df[['text','token_count']].rename(columns={'token_count':'label'})
df.to_csv('../data/clean/fiction/wikiAH_clean.csv')
display(df)

Unnamed: 0,text,label
0,Chicken farmer Arthur Williams (Harvey) deligh...,152
1,When convicted robber Jackie Blake (Hickman) i...,157
2,Inspector Benson (Moore) is tasked with preven...,158
3,When Jeff Jensen (Persoff) is non-fatally atta...,158
4,"While traveling on a plane, war correspondent ...",159
...,...,...
148,Aware that their friend Professor Rankin's (Em...,296
149,James Barrett (Gaines) busily plans for his ab...,296
150,Hugo (De Wilde) is a mentally challenged boy w...,296
151,Private investigator Cutter (Hanmer) visits mo...,297


# Hella Swag

In [164]:
df = pd.read_csv('../data/raw/misc/hellaslog.csv')
len(df)

6000

In [165]:
df = df.sample(300).reset_index(drop=True)
df = pd.DataFrame({'text':df['sentences']})
df.to_csv('../data/clean/misc/hellaSwag_clean.csv', index=False)
display(df)

Unnamed: 0,text
0,"A small dog is in a tub, and woman uses a spra..."
1,They take turns inhaling and exhaling the smok...
2,The man kneels down and shoes the camera the b...
3,Two men are sitting on a table plaing rock pap...
4,"Two girls prepares to jump rope, then the girl..."
...,...
295,Men run to cover behind the protectors while s...
296,A woman is seen standing with a tied up dog ou...
297,A group of people are in the backyard of a hou...
298,Kids are shown in side of a gym playing indoor...


# Psalms

In [190]:
df = pd.read_csv('../data/raw/prose/Psalm100-150NLT.csv')
df = df['text'].drop_duplicates()
len(df)

807

In [191]:
text = text.sample(300).reset_index(drop=True)
df = pd.DataFrame({'text':text})
df.to_csv('../data/clean/prose/psalms_clean.csv', index=False)
display(df)

Unnamed: 0,text
0,"Here is the ocean, vast and wide,\n teemin..."
1,"Shout with joy to the Lord, all the earth!"
2,I have chosen to be faithful;\n I have det...
3,"O Lord, what a variety of things you have mad..."
4,"Praise the Lord, the God of Israel,\n who ..."
...,...
295,He turned the rock into a pool of water;\n ...
296,My daily task will be to ferret out the wicke...
297,"I will praise you every day;\n yes, I will..."
298,"Lift your hands toward the sanctuary,\n an..."
