In [1]:
import json
import pandas as pd
from datasets import load_dataset

In [2]:
# Methods
def make_df(eq, col_name, frame=None):
    """Return the rows whose `col_name` value equals `eq`, re-indexed from 0.

    Parameters
    ----------
    eq : scalar
        Value to match in the column.
    col_name : str
        Name of the column compared against `eq`.
    frame : pandas.DataFrame, optional
        Frame to filter. When omitted, the notebook-level global `df` is
        used, which is the original behaviour of this helper.

    Returns
    -------
    pandas.DataFrame
        The matching rows with a fresh 0..n-1 index.
    """
    if frame is None:
        # Backward-compatible fallback: filter whatever `df` currently holds.
        frame = df
    return frame[frame[col_name] == eq].reset_index(drop=True)



# Code Samples

In [6]:
# Pull n examples from a huggingface streaming dataset
def sample_dataset(dataset_name, n=5000):
    """Collect up to `n` code samples and their languages from a streamed dataset.

    Parameters
    ----------
    dataset_name : str
        Dataset identifier passed to `load_dataset` (train split, streaming).
    n : int, default 5000
        Maximum number of examples to collect. Values <= 0 yield empty lists.

    Returns
    -------
    tuple[list, list]
        `(samples, langs)`: parallel lists built from each example's
        'code' and 'language' fields.
    """
    # Nothing requested: return before opening the stream. The loop below
    # appends before it checks the limit, so without this guard n <= 0 would
    # still return one example.
    if n <= 0:
        return [], []

    dataset = load_dataset(dataset_name, split='train', streaming=True)

    samples = []
    langs = []
    # Boxes up the required fields from the streaming dataset
    for example in dataset:
        samples.append(example['code'])
        langs.append(example['language'])
        # Stop as soon as the quota is met so no extra example is fetched
        if len(samples) >= n:
            break

    return samples, langs

# Stream the default number of examples (n=5000) from 'codeparrot/github-code'
samples, langs = sample_dataset(dataset_name='codeparrot/github-code')

In [7]:
# Frame of the streamed code samples; `make_df` below filters this global `df`
df = pd.DataFrame({'samples':samples,'langs':langs})

# Take 25 from the top 12 languages
variety = 12
sample_size = 25
# Use distinct names here: rebinding `samples`/`langs` would overwrite the
# inputs of the DataFrame construction above and break a re-run of this cell.
top_langs = df['langs'].value_counts(ascending=False).index[:variety]

dfs = [make_df(x, 'langs') for x in top_langs]
# Iterate over the per-language frames directly (no `range(variety)` indexing),
# so a sample containing fewer than `variety` languages does not raise
# IndexError. The fixed seed keeps the saved CSV reproducible across runs.
code_samples = pd.concat(
    [lang_df.sample(sample_size, random_state=42) for lang_df in dfs]
).reset_index(drop=True)
code_samples.to_csv('../data/clean/misc/code.csv', index=False)

# Medical Transcripts

In [8]:
# Load the raw transcripts (the 'Unnamed: 0' column becomes the index), drop
# rows with any missing value and exact duplicate rows, strip spaces from the
# specialty labels, then keep only the text and its specialty.
df = (
    pd.read_csv('../data/raw/nonfiction/med_trans.csv', index_col='Unnamed: 0')
    .dropna()
    .drop_duplicates()
    .assign(
        medical_specialty=lambda frame: frame['medical_specialty'].map(
            lambda specialty: specialty.replace(" ", "")
        )
    )
    .loc[:, ['transcription', 'medical_specialty']]
)
# Persist the cleaned two-column frame without the index
df.to_csv('../data/clean/nonfiction/med_trans_clean.csv', index=False)

# Movie Summaries

In [9]:
# DOWNLOAD NEEDED: ../data/raw/fiction/movie_sums.csv must be downloaded before running this cell

df = pd.read_csv('../data/raw/fiction/movie_sums.csv')

# Take 25 from the top 12 labeled genres
variety = 12
sample_size = 25
# Position 0 of the counts is skipped — presumably the unlabeled/"unknown"
# bucket (TODO confirm against the data) — and the next `variety` genres are kept.
genres = df['Genre'].value_counts().index[1:variety+1]

dfs = [make_df(x, 'Genre') for x in genres]
# Iterate over the per-genre frames directly so fewer than `variety` genres
# does not raise IndexError; the fixed seed keeps the saved CSV reproducible.
# The result gets its own name so the `samples` list built by the
# code-sampling cells is not overwritten.
movie_samples = pd.concat(
    [genre_df.sample(sample_size, random_state=42) for genre_df in dfs]
).reset_index(drop=True)
movie_samples = movie_samples[['Plot','Genre']]
movie_samples.to_csv('../data/clean/fiction/movie_summs_clean.csv', index=False)

In [10]:
# Spot check: length of the 'Plot' value at index 2 of the current `df`
len(df['Plot'][2])

436

# SAT Questions

In [11]:
# Load the raw SAT questions; this rebinds the global `df` that `make_df` filters
df = pd.read_csv('../data/raw/interrogative/sat_qstns.csv')

In [12]:
# Number of rows per value of the 'subject' column
df['subject'].value_counts()

subject
us_history       1108
world_history     272
Name: count, dtype: int64

# Case Law

In [152]:
# Set up streaming dataset object
dataset_stream = load_dataset("TeraflopAI/Caselaw_Access_Project", split="train", streaming=True)
# Initialize vars
len_exam = 0
txts = []

# Iterate through streaming dataset
for example in dataset_stream:
    # Stop once more than 300 chunks are collected. The check runs once per
    # example, so the total may overshoot; it is trimmed after the loop.
    if len_exam > 300:
        break

    # Split up current example by new line
    split_txt = example['text'].split("\n")
    # Index of the first line of the chunk currently being built
    counter = 0
    # Characters accumulated in the current chunk. Initialised here for every
    # example: previously it was read before ever being assigned (NameError on
    # a fresh kernel) and its value leaked from one example into the next.
    cum_len = 0

    # Grow the chunk line by line until it holds at least 1500 characters
    for x, row in enumerate(split_txt):
        cum_len += len(row)
        if cum_len >= 1500:
            # Store lines counter..x as one chunk, then start the next chunk
            # at the following line so the boundary line is not dropped.
            txts.append("\n".join(split_txt[counter:x + 1]))
            counter = x + 1
            cum_len = 0
    # A trailing remainder shorter than 1500 characters is discarded.

    # Update length of txts
    len_exam = len(txts)

# Store new texts into a DataFrame
df = pd.DataFrame({"texts":txts})
# Cleanup
df = df.dropna()
df = df.drop_duplicates()
# Label-based slice: keeps rows whose original index label is <= 299, so any
# duplicates removed above leave fewer than 300 rows.
df = df.loc[:299]
# Save (the index is written as the first CSV column, unlike the other exports)
df.to_csv('../data/clean/nonfiction/case_law.csv')

Resolving data files:   0%|          | 0/59 [00:00<?, ?it/s]

# Spam Text

In [90]:
# DOWNLOAD NEEDED: ../data/raw/colloquial/spam_email.csv must be downloaded before running this cell

# Load the e-mail data; this rebinds the global `df` that `make_df` filters
df = pd.read_csv('../data/raw/colloquial/spam_email.csv')
# Render the loaded frame as the cell output
display(df)

Unnamed: 0,label,text
0,1,ounce feather bowl hummingbird opec moment ala...
1,1,wulvob get your medircations online qnb ikud v...
2,0,computer connection from cnn com wednesday es...
3,1,university degree obtain a prosperous future m...
4,0,thanks for all your answers guys i know i shou...
...,...,...
83443,0,hi given a date how do i get the last date of ...
83444,1,now you can order software on cd or download i...
83445,1,dear valued member canadianpharmacy provides a...
83446,0,subscribe change profile contact us long term ...


In [91]:
# Split the e-mail frame (the current global `df`) by label and sample each
# class: 300 rows with label 0 and 150 rows with label 1. The final cell saves
# the label-1 sample as spam and the label-0 sample as regular e-mail.
# Fixed seeds make the saved samples reproducible across runs.
emails = make_df(0, 'label')
emls_sampled = emails.sample(300, random_state=42).reset_index(drop=True)
eml_spam = make_df(1, 'label')
eml_spam_sampled = eml_spam.sample(150, random_state=42).reset_index(drop=True)

In [92]:
# Load the SMS data (ISO-8859-1 encoded), keep its first two named columns and
# give them the same 'label'/'text' names used by the e-mail frame.
df = (
    pd.read_csv('../data/raw/colloquial/spam_text.csv', encoding='ISO-8859-1')
    .loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'text'})
)
# Render the loaded frame as the cell output
display(df)

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [93]:
# Split the SMS frame (the current global `df`) by label and sample each
# class: 300 'ham' rows and 150 'spam' rows.
# Fixed seeds make the saved samples reproducible across runs.
texts = make_df('ham', 'label')
txts_sampled = texts.sample(300, random_state=42).reset_index(drop=True)
txt_spam = make_df('spam', 'label')
txt_spam_sampled = txt_spam.sample(150, random_state=42).reset_index(drop=True)

In [94]:
# Spam labels differ by source (1 for e-mail rows, 'spam' for SMS rows), so the
# label doubles as a marker of where each message came from.
key = {1:'email','spam':'text'}
# Stack both spam samples, translate the label into its source and expose it
# under the column name 'textType'.
cc_spam = (
    pd.concat([eml_spam_sampled, txt_spam_sampled], ignore_index=True)
    .assign(label=lambda frame: frame['label'].map(key))
    .rename(columns={'label': 'textType'})
)
# Write the combined spam set and the two non-spam samples, all without the index
cc_spam.to_csv('../data/clean/colloquial/spam_msgs_clean.csv', index=False)
txts_sampled.to_csv('../data/clean/colloquial/txt_msgs_clean.csv', index=False)
emls_sampled.to_csv('../data/clean/colloquial/email_msgs_clean.csv', index=False)