## Run only the first time (downloads the corpus and caches it as a CSV file)

In [1]:
import pandas as pd
import spacy
import pprint
from matplotlib import pyplot as plt

In [2]:
from datasets import load_dataset

# Run only the first time.
# Fetches the corpus with `load_dataset`, forcing a fresh download, and prints
# the first training record as a sanity check.
corpus = load_dataset(
    'blog_authorship_corpus',
    download_mode='force_redownload',
)
pprint.pprint(corpus['train'][0])

Downloading builder script:   0%|          | 0.00/2.27k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

Downloading and preparing dataset blog_authorship_corpus/blog_authorship_corpus (download: 603.58 MiB, generated: 758.24 MiB, post-processed: Unknown size, total: 1.33 GiB) to C:\Users\user\.cache\huggingface\datasets\blog_authorship_corpus\blog_authorship_corpus\1.0.0\6f5d78241afd8313111956f877a57db7a0e9fc6718255dc85df0928197feb683...


Downloading data:   0%|          | 0.00/633M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/689793 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/37919 [00:00<?, ? examples/s]

Dataset blog_authorship_corpus downloaded and prepared to C:\Users\user\.cache\huggingface\datasets\blog_authorship_corpus\blog_authorship_corpus\1.0.0\6f5d78241afd8313111956f877a57db7a0e9fc6718255dc85df0928197feb683. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

{'age': 17,
 'date': '23,November,2002',
 'gender': 'female',
 'horoscope': 'Libra',
 'job': 'Student',
 'text': "Yeah, sorry for not writing for a whole there, but I've had a pretty "
         'busy weekend so far. I found out I have about 20 tests on Tuesday '
         'because my teachers are evil...but oh well. We had marching '
         'yesterday. It was cool. We stayed inside the whole time, and I got '
         'to play second part, so the music part will be really easy. I left '
         'early from marching at around 5:00, then went out to dinner with my '
         'friends (I made the plans before I knew about marching). So, dinner '
         "was fun. It was Kelly's birthday, so it was cool. Then we went to "
         "Kel's for a sleepover, and did bunches of neat stuff, and I went to "
         'sleep earlier than usual for a sleepover since I had to be at a Bar '
         'Mitzvah this morning. It was pretty neat. I have to do this project '
         'for CR, and we have

In [3]:
# Materialise both splits as DataFrames and stack them into one frame.
train_df, test_df = (corpus[split].to_pandas() for split in ('train', 'validation'))
full_df = pd.concat((train_df, test_df), ignore_index=True)

# Sanity check: the combined frame must hold every row of both splits.
print(len(train_df) + len(test_df) == len(full_df))
full_df.sample(10)

True


Unnamed: 0,text,date,gender,age,horoscope,job
291152,Hey students...I am settling in and have alrea...,"23,July,2004",male,17,Aquarius,indUnk
680625,"son of a bitch, I forgot the Wednesday Best AG...","23,September,2003",female,16,Aries,Student
96450,"By the way, that was also not meant as a chast...","12,July,2003",male,23,Cancer,indUnk
617426,"Cum sucker, fetal rapist, Chunk the Drunk... A...","09,October,2003",female,25,Scorpio,Engineering
700667,urlLink the people i deal with every day,"08,May,2004",male,24,Taurus,indUnk
521885,"Look, I made a profile. I feel like such a los...","13,July,2004",male,25,Pisces,indUnk
286460,Tonight was knitting night. I am no longer the...,"30,May,2004",female,38,Virgo,indUnk
65743,Jama is leaving again! She's going to camp! :-...,"17,July,2004",female,15,Libra,indUnk
142553,Weâve been driving all this week through New...,"17,July,2004",female,27,Cancer,indUnk
104222,When the idea for The Thackery T. Lambshead P...,"24,June,2003",male,36,Cancer,indUnk


In [5]:
# Persist the combined corpus as UTF-8 CSV (without the index column).
full_df.to_csv(
    'blog_authorship_corpus.csv',
    encoding='utf-8',
    index=False,
)

## Run on every subsequent session (loads the cached CSV)

In [2]:
# Load the corpus from the CSV file and display its (rows, columns) shape.
full_df = pd.read_csv(
    'blog_authorship_corpus.csv',
    encoding='utf-8',
)
full_df.shape

(727712, 6)

In [3]:
# Show the rows whose text is missing.
full_df.loc[full_df['text'].isna()]

Unnamed: 0,text,date,gender,age,horoscope,job
196562,,"29,April,2004",female,23,Capricorn,Communications-Media


In [4]:
# Discard every row that has a missing value in any column.
full_df = full_df.dropna(axis='index', how='any')

## Cleaning the dataframe

In [5]:
# Remove the 'date' column from the frame.
full_df = full_df.drop('date', axis='columns')

In [6]:
# How many rows contain 'urlLink'? Count them, strip the marker from the text,
# then count again to confirm none are left.
has_marker = full_df['text'].str.contains('urlLink')
print(has_marker.sum())
full_df['text'] = full_df['text'].str.replace('urlLink', '')
has_marker = full_df['text'].str.contains('urlLink')
print(has_marker.sum())

193495
0


In [7]:
# Number of posts whose author is younger than 18.
int((full_df['age'] < 18).sum())

247384

In [8]:
# Transform age into age groups.
# The edges are the lower bound of each group and the intervals are closed on
# the left (right=False), giving [0, 18), [18, 25), [25, 35) and [35, inf).
# With pd.cut's default right-closed intervals, the former edges
# [0, 18, 24, 34, 100] assigned age 18 to '<18' and mapped ages above 100 to NaN.
bins   = [0, 18, 25, 35, float('inf')]
# NOTE: the last label is kept as '>35' for compatibility with existing outputs,
# although the group also contains age 35 itself.
labels = ['<18', '18-24', '25-34', '>35']
full_df['age_group'] = pd.cut(full_df['age'], bins=bins, labels=labels, right=False)
# Per-group counts in bin order; dropna=False surfaces any age left unbinned.
full_df['age_group'].value_counts(sort=False, dropna=False)

<18      247384
18-24    166394
25-34    222498
>35       91435
Name: age_group, dtype: int64

In [9]:
# Remove the 'age' column from the frame.
full_df = full_df.drop('age', axis='columns')

In [10]:
# Convert columns to categories
cat_cols = full_df.columns.difference(['text']).tolist()
for col in cat_cols:
    full_df[col] = full_df[col].astype('category')
    vc = full_df[col].value_counts()
    print(f'{col.upper()}', vc, sep='\n', end='\n\n')

AGE_GROUP
<18      247384
25-34    222498
18-24    166394
>35       91435
Name: age_group, dtype: int64

GENDER
male      370786
female    356925
Name: gender, dtype: int64

HOROSCOPE
Aries          71636
Virgo          68490
Taurus         67868
Cancer         66944
Libra          63994
Leo            59801
Scorpio        58870
Gemini         56921
Pisces         56481
Sagittarius    52753
Capricorn      52206
Aquarius       51747
Name: horoscope, dtype: int64

JOB
indUnk                     263240
Student                    164288
Technology                  53882
Arts                        34772
Education                   30919
Communications-Media        20649
Internet                    17621
Non-Profit                  16015
Engineering                 12386
Law                          9156
Science                      7819
Publishing                   7718
Government                   6943
Consulting                   5956
Religion                     5431
Advertising        

In [11]:
# A post is kept only if it has more than this many whitespace-separated words.
MIN_WORDS = 10

# Word count per post, using plain whitespace splitting.
full_df['n_words'] = full_df['text'].apply(lambda x: len(x.split()))
print(full_df.shape[0])  # rows before filtering
# Drop short texts. The explicit .copy() makes the filtered frame an independent
# object, so assigning columns to `full_df` afterwards does not raise
# SettingWithCopyWarning.
full_df = full_df[full_df['n_words'] > MIN_WORDS].copy()
print(full_df.shape[0])  # rows after filtering

727711
635599


In [12]:
# Remove URLs: the scheme plus the run of non-whitespace characters after it.
# The former pattern used a greedy `.*`, which matched through to the end of the
# line and therefore deleted all the text that followed a link.
full_df['text'] = full_df['text'].str.replace(r'https?://\S*', ' ', regex=True)
full_df['text'] = full_df['text'].str.replace(r'&[a-z]+;', ' ', regex=True) # remove html entities
full_df['text'] = full_df['text'].str.replace(r'\s+', ' ', regex=True) # remove multiple spaces

In [13]:
# Write the cleaned frame to disk as UTF-8 CSV (without the index column).
full_df.to_csv(
    'blog_authorship.csv',
    encoding='utf-8',
    index=False,
)

## Tokenize

In [14]:
nlp = spacy.load('en_core_web_md')

def tokenize(text):
    """Return `text` as lower-cased spaCy tokens joined by single spaces.

    Only the tokenizer is run (`nlp.make_doc`). The result uses nothing but the
    token texts, so running the remaining pipeline components on every post
    would add a large cost without affecting the output.

    Args:
        text: The raw string to tokenize.

    Returns:
        A single string with the lower-cased tokens separated by spaces.
    """
    doc = nlp.make_doc(text)
    return " ".join(token.text.lower() for token in doc)

full_df['tokenized'] = full_df['text'].apply(tokenize)
full_df.sample(15)

Unnamed: 0,text,gender,horoscope,job,age_group,n_words,tokenized
144099,I was looking through some other blogs today a...,female,Libra,Education,25-34,101,i was looking through some other blogs today a...
557753,I noticed that I am writing way more than I in...,male,Capricorn,Internet,>35,201,i noticed that i am writing way more than i in...
648631,"My day was interesting enough. God, I hate my ...",female,Virgo,Student,<18,235,"my day was interesting enough . god , i hate m..."
273381,.....and he emerged from the dark clouds and s...,male,Sagittarius,indUnk,<18,318,..... and he emerged from the dark clouds and ...
110954,"The vomit's coming, Bill. I promise. To Contin...",female,Virgo,indUnk,>35,247,"the vomit 's coming , bill . i promise . to co..."
561162,I promised you a diatribe about the state of j...,female,Libra,Education,25-34,614,i promised you a diatribe about the state of j...
572565,something i know is i dont know wat to write h...,male,Sagittarius,Student,<18,152,something i know is i do nt know wat to write ...
156594,"Hey, I missed a couple of days, but that's ok....",male,Libra,Education,25-34,164,"hey , i missed a couple of days , but that 's ..."
446580,Sudden Flash Back. I think I'm getting my memo...,female,Scorpio,Communications-Media,25-34,188,sudden flash back . i think i 'm getting my me...
80368,Iraqi Weapons Only One Reason for War-Wolfowit...,male,Aquarius,Education,>35,393,iraqi weapons only one reason for war - wolfow...


In [15]:
# Write the frame, including the 'tokenized' column, to disk as UTF-8 CSV
# (without the index column).
full_df.to_csv(
    'blog_authorship_tokenized.csv',
    encoding='utf-8',
    index=False,
)