In [1]:
import json
import pandas as pd
from datasets import load_dataset

In [2]:
# Methods
def make_df(eq, col_name):
    df_ = df[df[col_name] == eq].reset_index(drop=True)
    return df_



# Codes

In [3]:
# Pull n examples from a huggingface streaming dataset
def sample_dataset(dataset_name, n=5000):
    dataset = load_dataset(dataset_name, split='train', streaming=True)

    samples = []
    langs = []
    # Boxes up the required fields from the streaming dataset
    for example in dataset:
        samples.append(example['code'])
        langs.append(example['language'])
        if len(samples) >= n:
            break

    return samples, langs

samples, langs = sample_dataset('codeparrot/github-code')

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [4]:
df = pd.DataFrame({'text':samples,'label':langs})

# Take 25 from the top 12 languages
variety = 12
sample_size = 25
langs = df['label'].value_counts(ascending=False).index[:variety]

dfs = [make_df(x, 'label') for x in langs]
samples = pd.concat([ dfs[x].sample(sample_size) for x in range(variety) ]).reset_index(drop=True)
samples.to_csv('../data/clean/misc/code.csv', index=False)
display(samples)

Unnamed: 0,text,label
0,#!/usr/bin/env node\n\n//\n// cli.js\n//\n// C...,JavaScript
1,"window.ImageViewer = function(url, alt, title)...",JavaScript
2,'use strict';\r\n\r\nmodule.exports = {\r\n ...,JavaScript
3,// -------------------------------------------...,JavaScript
4,"const {\n createServer,\n plugins: { queryPa...",JavaScript
...,...,...
295,"@import url(""//fonts.googleapis.com/css?family...",CSS
296,.vertical-center {\n min-height: 100%; /* Fa...,CSS
297,body {\n padding-top: 65px;\n}\n\n#footer {\n...,CSS
298,.home-layout {\n padding-top: 50px;\n paddin...,CSS


# Med Transcripts

In [5]:
df = pd.read_csv('../data/raw/nonfiction/med_trans.csv', index_col='Unnamed: 0')
len(df)

4999

In [6]:
df = df.dropna()
df = df.drop_duplicates()
df['medical_specialty'] = [x.replace(" ", "") for x in df['medical_specialty']]
df = df[['transcription','medical_specialty']]
df = df.rename(columns={'transcription':'text','medical_specialty':'label'})
df = df.sample(300).reset_index(drop=True)
df.to_csv('../data/clean/nonfiction/med_trans_clean.csv', index=False)
display(df)

Unnamed: 0,text,label
0,"HISTORY OF PRESENT ILLNESS:, Ms. Connor is a ...",SOAP/Chart/ProgressNotes
1,"EYES: , The conjunctivae are clear. The lids ...",OfficeNotes
2,"PREOPERATIVE DIAGNOSES:,1. Enlarged fibroid u...",Surgery
3,"HISTORY OF PRESENT ILLNESS:, The patient pres...",SOAP/Chart/ProgressNotes
4,"HX: ,This 46y/o RHM with HTN was well until 2 ...",Neurology
...,...,...
295,"COCCYGEAL INJECTION,PROCEDURE:,: Informed con...",PainManagement
296,"SUBJECTIVE:, This 3-year-old male is brought ...",GeneralMedicine
297,"ADENOIDECTOMY,PROCEDURE:, The patient was bro...",ENT-Otolaryngology
298,"EXAM: , Barium enema.,CLINICAL HISTORY: , A 4-...",Gastroenterology


# Movie Summaries

In [7]:
# DOWNLOAD NEEDED: Uncomment after you download the data

df = pd.read_csv('../data/raw/fiction/movie_sums.csv')
len(df)

34886

In [8]:
# Take 25 from the top 12 labeled genres
variety = 12
sample_size = 25
genres = df['Genre'].value_counts().index[1:variety+1]

dfs = [make_df(x, 'Genre') for x in genres]
samples = pd.concat([ dfs[x].sample(sample_size) for x in range(variety) ]).reset_index(drop=True)
samples = samples[['Plot','Genre']]
samples = samples.rename(columns={'Plot':'text','Genre':'label'})
samples.to_csv('../data/clean/fiction/movie_summs_clean.csv', index=False)
display(samples)

Unnamed: 0,text,label
0,Dukhu Mia (Farooque) is a stage performer and ...,drama
1,Vera Drake (Imelda Staunton) is devoted to her...,drama
2,Ann Lemp Borden (Priscilla Lane) has been rece...,drama
3,Annie O'Farrell (based on Anne McDonald) is a ...,drama
4,"On the morning of September 11, 2001, a messen...",drama
...,...,...
295,Journalism instructor Erica Stone (Doris Day) ...,romantic comedy
296,Jimmie Shannon (Chris O'Donnell) wants to be a...,romantic comedy
297,The movie follows a man's search for perfectio...,romantic comedy
298,Kiki (Norma Talmadge) ekes out a living sellin...,romantic comedy


# SAT Questions

In [9]:
df = pd.read_csv('../data/raw/interrogative/sat_qstns.csv')
len(df)

1380

In [10]:
txts = []
texts = df['prompt'].dropna()
txts += [x for x in texts if '?' in x]
txts = pd.DataFrame({'text':txts})
txts_smpled = txts.sample(300).reset_index(drop=True)
txts_smpled.to_csv('../data/clean/interrogative/sat_qstns_clean.csv', index=False)
display(txts_smpled)

Unnamed: 0,text
0,"Which of the following machines, invented in 1..."
1,Which of the following statements about 18th-c...
2,Which of the following statements from the Sen...
3,"Which of the following organizations, born in ..."
4,"By the year 1100 CE, the strongest Norman infl..."
...,...
295,"One of the best ways to cope with (fear), is t..."
296,Mumbai was a world leader in which industry du...
297,What did the Gulf of Tonkin Resolution state?
298,Which of the 13 British colonies was founded o...


# Case Law

In [11]:
# Set up streaming dataset object
dataset_stream = load_dataset("TeraflopAI/Caselaw_Access_Project", split="train", streaming=True)
# Initialize vars
len_exam = 0
txts = []
cum_len = 0

# Iterate through streaming dataset
for example in dataset_stream:
    # Check for less than 300 examples
    if len_exam <= 300:
        # Split up current example by new line
        split_txt = example['text'].split("\n")
        # Start current example counter
        counter = 0
        # Start current example placeholder
        placeholder = 1

        # Iterate through the new line splits of the current example
        for x in range(len(split_txt)):
            # Save current line
            row = split_txt[x]
            # Save its length
            char_len = len(row)
            # If our current new_txt is less than 1500 characters:
            if cum_len < 1500:
                # Then take the current example in the range of counter -> placeholder
                new_txt = "\n".join(split_txt[counter:placeholder])
                # Reassess the sumulative new_text length
                cum_len += char_len        
            # If new_txt is over 1500 characters
            else:
                # Then store new_txt into txts
                txts.append(new_txt)
                # Increase counter to placeholder
                counter = placeholder
                # Reset cumulative length
                cum_len = 0
            # Update placeholder
            placeholder += 1
        # Update length of txts
        len_exam = len(txts)
    # Otherwise Break
    else:
        break
    
# Store new texts into a DataFrame
df = pd.DataFrame({"text":txts})
# Cleanup
df = df.dropna()
df = df.drop_duplicates()
df = df.loc[:299]
# Save
df.to_csv('../data/clean/nonfiction/case_law.csv', index=False)
display(df)

Resolving data files:   0%|          | 0/59 [00:00<?, ?it/s]

Unnamed: 0,text
0,"OPINION\nRABINOWITZ, Justice.\nI. INTRODUCTION..."
1,The data summary stated that there was a layer...
2,NCI contends that Green's visit was scheduled ...
3,"In April 1993, NCI moved for partial summary j..."
4,In Brinderson Corp. v. Hampton Rds. San. Dist....
...,...
295,Gargan and Knix dealt with notarized statement...
296,Harrison's conviction is AFFIRMED.\n. In Garga...
297,"OPINION\nEASTAUGH, Justice.\nWe must here cons..."
298,"A hearing on Huston's August 21, 1991, applica..."


# Spam Text

In [12]:
# DOWNLOAD NEEDED: Uncomment after you download the data

df = pd.read_csv('../data/raw/colloquial/spam_email.csv')

In [13]:
emails = make_df(0, 'label')
print(f" OG Emails Count: {len(emails)}")
emls_sampled = emails.sample(300).reset_index(drop=True)
eml_spam = make_df(1, 'label')
eml_spam_sampled = eml_spam.sample(150).reset_index(drop=True)

 OG Emails Count: 39538


In [14]:
df = pd.read_csv('../data/raw/colloquial/spam_text.csv', encoding='ISO-8859-1')
df = df[['v1','v2']].rename(columns={'v1':'label','v2':'text'})

In [15]:
texts = make_df('ham', 'label')
print(f" OG Texts Count: {len(texts)}")
txts_sampled = texts.sample(300).reset_index(drop=True)
txt_spam = make_df('spam', 'label')
print(f" OG Spam Txt Count: {len(txt_spam)+len(eml_spam)}")
txt_spam_sampled = txt_spam.sample(150).reset_index(drop=True)

 OG Texts Count: 4825
 OG Spam Txt Count: 44657


In [16]:
cc_spam = pd.concat([eml_spam_sampled, txt_spam_sampled]).reset_index(drop=True)
key = {1:'email','spam':'text'}
cc_spam['label'] = cc_spam['label'].map(key)
cc_spam.to_csv('../data/clean/colloquial/spam_msgs_clean.csv', index=False)
txts_sampled.to_csv('../data/clean/colloquial/txt_msgs_clean.csv', index=False)
emls_sampled.to_csv('../data/clean/colloquial/email_msgs_clean.csv', index=False)
display(cc_spam)

Unnamed: 0,label,text
0,email,subliminal is the key insists richard buckner...
1,email,home owners great options available for refi a...
2,email,makro clueim alli steps runtime selected freez...
3,email,gather saturday hit the the that union in isra...
4,email,http hotsolution escapelong pills x escapenumb...
...,...,...
295,text,Message Important information for O2 user. Tod...
296,text,Congrats! 1 year special cinema pass for 2 is ...
297,text,December only! Had your mobile 11mths+? You ar...
298,text,Congratulations ur awarded either å£500 of CD ...


# Bible Prompts
*These data were generated for this event, therefore they're preprocessed entirely

In [17]:
df = pd.read_csv('../data/raw/declarative/BiblePrompts.csv', index_col='Unnamed: 0')
len(df)

480

In [18]:
df = df.sample(300).reset_index(drop=True)
df.to_csv('../data/clean/declarative/bible_clean.csv', index=False)
display(df)

Unnamed: 0,original_text,rewrite_prompt,rewritten_text
0,"I said to myself,\r\nGod will bring into judg...",Rewrite this like Shakspeare,"Sure, here's the rewritten text like a preache..."
1,No one has ever gone into heaven except the on...,Convert this into a sea shanty.,**Verse 1:**\r\nThere's a tale of a tale to te...
2,"Jesus said, I am with you for only a short ti...",Change the text so it means the oposite,"Sure, here's the opposite:\r\n\r\nJesus said, ..."
3,"Meanwhile his disciples urged him, Rabbi, eat...",Convert this into old english,"Sure, here is the converted text into old engl..."
4,We have come to believe and to know that you a...,What is this about?,The text is a quote from the Gospel of John 6:...
...,...,...,...
295,"What did he mean when he said, You will look ...",Turn this into a riddle,"Sure, here's the riddle:\r\n\r\nI'm a mystery ..."
296,This too is a grievous evil:\r\nAs everyone c...,Explain this to me like I'm five.,"Sure, here's the explanation for you:\r\n\r\nI..."
297,"children born not of natural descent, nor of ...",What is this about?,The text refers to children born to believers ...
298,and come outthose who have done what is good ...,Rewrite this so every fifth word is dog,"Sure, here is the rewritten text:\r\n\r\nThose..."


In [19]:
rw_df = df['rewrite_prompt'].drop_duplicates()
rw_df.to_csv('../prompts/bible-prompts.csv', index=False)
display(rw_df)

0                          Rewrite this like Shakspeare
1                       Convert this into a sea shanty.
2               Change the text so it means the oposite
3                         Convert this into old english
4                                   What is this about?
5                   Convert this into an engaging story
6               Rewrite this so every fifth word is dog
9                            Convert this into a sermon
10    Can you turn this into a riddle?Rewrite this a...
11                                     Make this rhyme.
12                                 Make this about cats
14             Rewrite this in the style of Shakespeare
16                      Create a rap song from the text
25                              Turn this into a riddle
29                           Explain why this is wrong.
37                    Explain this to me like I'm five.
47                     Convert this into modern english
57           Please rephrase this to make it mor

# Essays
*These data were generated for this event, therefore they're preprocessed entirely

In [20]:
df = pd.read_csv('../data/raw/fiction/essays.csv')
len(df)

2166

In [21]:
df = df[['original_text','rewrite_prompt','rewritten_text']]
rws = df['rewrite_prompt']
df['rewrite_prompt'] = pd.DataFrame({'prompts':[x.split(':')[1] for x in rws]})
df = df.sample(300).reset_index(drop=True)
df.to_csv('../data/clean/fiction/essays_clean.csv', index=False)
display(df)

Unnamed: 0,original_text,rewrite_prompt,rewritten_text
0,They used to tell me that there is no such thi...,"Rewrite the text using modern slang, abbrevia...","OMG, right? Magic is real, dude. It's not actu..."
1,"`` I swear I did n't do it,'' the man sobbed, ...",Transform the text to reflect the elegance an...,The grand halls of Pemberley Manor echoed with...
2,The man from the council stared blankly the an...,"Emulate the writing of a Renaissance scholar,...","The councilman, his countenance unmoving, star..."
3,"The Moon, a Crack, and Conspiracies \n \n \n (...",Adopt the refined and eloquent language of a ...,"The celestial canvas above us, adorned with st..."
4,`` The M.Modification files were destroyed for...,Use the bold and brash language of a Viking w...,The M.Modification files were destroyed for a ...
...,...,...,...
295,Terra was always considered the land of misfit...,Use the precise and excited tone of an enthus...,"Terra, the enigmatic land of misfits, was an e..."
296,A scruff stocky man walks towards the table. H...,Use the confident and jargon-heavy language o...,"A stern, muscular figure strides confidently t..."
297,When did I fall in love with her? As we eat ou...,"Infuse the essay with the rambling, disjointe...","When I fall in toob the drunkerness of love, I..."
298,My whole life I ’ ve focused on becoming a ten...,Write with the cunning and secretive style of...,My life has been a symphony of sweat and deter...


In [22]:
rw_df = df['rewrite_prompt'].drop_duplicates()
rw_df.to_csv('../prompts/essay-prompts.csv', index=False)
display(rw_df)

0       Rewrite the text using modern slang, abbrevia...
1       Transform the text to reflect the elegance an...
2       Emulate the writing of a Renaissance scholar,...
3       Adopt the refined and eloquent language of a ...
4       Use the bold and brash language of a Viking w...
                             ...                        
238     Adopt the formal and polite manner of speech ...
261     Infuse the essay with the lyrical and profoun...
267     Write with the inspiring and determined tone ...
270     Infuse the essay with the ironic, slang-heavy...
297     Infuse the essay with the rambling, disjointe...
Name: rewrite_prompt, Length: 103, dtype: object

# Alfred Hitchcock

In [23]:
df = pd.read_csv('../data/raw/fiction/WikiAH.csv')
len(df)

153

In [24]:
df = df[['text','token_count']].rename(columns={'token_count':'label'})
df.to_csv('../data/clean/fiction/wikiAH_clean.csv')
display(df)

Unnamed: 0,text,label
0,Chicken farmer Arthur Williams (Harvey) deligh...,152
1,When convicted robber Jackie Blake (Hickman) i...,157
2,Inspector Benson (Moore) is tasked with preven...,158
3,When Jeff Jensen (Persoff) is non-fatally atta...,158
4,"While traveling on a plane, war correspondent ...",159
...,...,...
148,Aware that their friend Professor Rankin's (Em...,296
149,James Barrett (Gaines) busily plans for his ab...,296
150,Hugo (De Wilde) is a mentally challenged boy w...,296
151,Private investigator Cutter (Hanmer) visits mo...,297


# Hella Swag

In [25]:
df = pd.read_csv('../data/raw/misc/hellaslog.csv')
len(df)

6000

In [26]:
df = df.sample(300).reset_index(drop=True)
df = pd.DataFrame({'text':df['sentences']})
df.to_csv('../data/clean/misc/hellaSwag_clean.csv', index=False)
display(df)

Unnamed: 0,text
0,The personal trainer is using an exercise bicy...
1,"A woman demonstrates how to do sit ups, in a g..."
2,A bench is shown in a foyer. A woman puts her ...
3,We see a person holding a contact lens. We see...
4,A woman is seen sitting in a wheelchair lookin...
...,...
295,Two girls are standing in roller skates.
296,Once the boy has finally done the flip success...
297,He puts the front tire on the bike. He puts th...
298,A person uses a vacuum cleaner the clean the c...


# Psalms

In [27]:
df = pd.read_csv('../data/raw/prose/Psalm100-150NLT.csv')
df = df['text'].drop_duplicates()
len(df)

807

In [29]:
text = df.sample(300).reset_index(drop=True)
df = pd.DataFrame({'text':text})
df.to_csv('../data/clean/prose/psalms_clean.csv', index=False)
display(df)

Unnamed: 0,text
0,For he broke down their prison gates of bronz...
1,Let the godly strike me!\n It will be a ki...
2,Praise the Lord!\nHow good to sing praises to...
3,"Your name, O Lord, endures forever;\n your..."
4,I thank you for answering my prayer\n and ...
...,...
295,I will keep my promises to the Lord\n in t...
296,"wild animals and all livestock,\n small sc..."
297,"He broke my strength in midlife,\n cutting..."
298,"All of your works will thank you, Lord,\n ..."
