In [1]:
import pandas as pd
import spacy

In [2]:
# Load the pre-tokenized blog corpus and discard the `n_words` column,
# which is not used by the feature extraction in this notebook.
df_init = (
    pd.read_csv('tokenized/blog_authorship_tokenized.csv')
    .drop(columns=['n_words'])
)
# Show a handful of random rows as a quick sanity check of the load.
df_init.sample(5)

Unnamed: 0,text,gender,horoscope,job,age_group,tokenized
132436,"It's 3:00 and I am bored at work, so why not p...",female,Aries,indUnk,18-24,"it 's 3:00 and i am bored at work , so why not..."
217103,"As of late, I've been feeling like shit, parct...",female,Pisces,Student,<18,"as of late , i 've been feeling like shit , pa..."
626553,No i've not been dead or in a coma or somin el...,male,Aquarius,Student,<18,no i 've not been dead or in a coma or somin e...
80832,Too Early... i got to school ( UA&P ) too earl...,male,Aries,indUnk,18-24,too early ... i got to school ( ua&p ) too ear...
343573,""" R adiohead is concise..Mars Volta tools alot...",male,Leo,Consulting,25-34,""" r adiohead is concise .. mars volta tools al..."


In [3]:
# Load the spaCy pipeline once at cell level so every call to
# get_pos_features() reuses the same model instance.
nlp = spacy.load('en_core_web_md')

def get_pos_features(text):
    """Return the part-of-speech tags of ``text`` as one string.

    The text is run through the module-level ``nlp`` pipeline with the
    ``parser`` and ``ner`` components disabled for this call, since only
    the per-token ``pos_`` attribute is read.

    Args:
        text: Raw text to tag (passed straight to ``nlp``).

    Returns:
        The ``pos_`` value of every token, in document order, joined by
        single spaces (e.g. ``"PRON AUX ADJ PUNCT"``).
    """
    parsed = nlp(text, disable=['parser', 'ner'])
    tags = (tok.pos_ for tok in parsed)
    return " ".join(tags)

In [4]:
# Add a `pos` column holding the space-separated POS tag sequence of each
# raw `text` entry. The tagger runs row by row, so this cell is slow on
# large frames.
df_init = df_init.assign(
    pos=lambda frame: frame['text'].apply(get_pos_features)
)
# Inspect a random sample to eyeball the new column.
df_init.sample(10)

Unnamed: 0,text,gender,horoscope,job,age_group,tokenized,pos
369149,Het is de afgelope dagen echt verschrikkelijk ...,male,Aries,Non-Profit,<18,het is de afgelope dagen echt verschrikkelijk ...,PROPN AUX PROPN PROPN PROPN PROPN PROPN PROPN ...
356104,So the couple I had slept with back during win...,male,Pisces,Student,18-24,so the couple i had slept with back during win...,ADV DET NOUN PRON AUX VERB ADP ADV ADP NOUN VE...
455831,"Here at Toyah Creek, we consider it our missio...",male,Virgo,Student,18-24,"here at toyah creek , we consider it our missi...",ADV ADP PROPN PROPN PUNCT PRON VERB PRON PRON ...
344621,finally got my PC back home and online... but ...,female,Cancer,indUnk,18-24,finally got my pc back home and online ... but...,ADV VERB PRON NOUN ADV ADV CCONJ ADV PUNCT CCO...
285462,"so athens may not be ready for the olympics, b...",female,Virgo,indUnk,25-34,"so athens may not be ready for the olympics , ...",ADV PROPN AUX PART AUX ADJ ADP DET NOUN PUNCT ...
424628,I haven't posted in like a bagillion years. I'...,male,Virgo,Non-Profit,<18,i have n't posted in like a bagillion years . ...,PRON AUX PART VERB ADP ADP DET NOUN NOUN PUNCT...
440239,"Just when youâre trying to forget someone, t...",female,Gemini,Student,<18,"just when youâre trying to forget someone , ...",ADV SCONJ VERB VERB PART VERB PRON PUNCT VERB ...
517168,"Well, the back of the DVD didn't have a plot s...",male,Capricorn,indUnk,>35,"well , the back of the dvd did n't have a plot...",INTJ PUNCT DET NOUN ADP DET NOUN AUX PART AUX ...
249472,Hey! Look at my blog! it is different. woooooo...,female,Capricorn,Non-Profit,<18,hey ! look at my blog ! it is different . wooo...,INTJ PUNCT VERB ADP PRON NOUN PUNCT PRON AUX A...
542578,i'm uncomfortable. investigative reporting see...,male,Taurus,indUnk,25-34,i 'm uncomfortable . investigative reporting s...,PRON AUX ADJ PUNCT ADJ NOUN VERB ADV ADJ ADP P...


In [5]:
from string import punctuation

def total_punct(text: str) -> int:
    """Count how many characters of ``text`` are punctuation marks.

    A character counts as punctuation when it appears in
    ``string.punctuation`` (the ASCII punctuation set).

    Args:
        text: The string to inspect.

    Returns:
        Number of punctuation characters; 0 for an empty string.
    """
    tally = 0
    for symbol in text:
        if symbol in punctuation:
            tally += 1
    return tally

def punct_dist(text: str) -> float:
    """Return the fraction of characters in ``text`` that are punctuation.

    A character counts as punctuation when it appears in
    ``string.punctuation`` (the ASCII punctuation set). The count is
    divided by the total number of characters, whitespace included.

    Args:
        text: The string to inspect.

    Returns:
        A value between 0.0 and 1.0. An empty string yields 0.0 rather
        than raising ``ZeroDivisionError``.
    """
    # Guard the division: an empty text has no characters to normalise by.
    if not text:
        return 0.0
    punct_chars = sum(1 for char in text if char in punctuation)
    return punct_chars / len(text)

def count_punct(text: str) -> dict:
    """Count the occurrences of each punctuation mark in ``text``.

    Args:
        text: The string to inspect.

    Returns:
        A dict with one key per character of ``string.punctuation`` (in
        that order), mapped to how many times it occurs in ``text``.
        Marks that never occur are present with a count of 0.
    """
    occurrences = {}
    for mark in punctuation:
        occurrences[mark] = text.count(mark)
    return occurrences

In [6]:
# Derive the punctuation features from the raw `text` column and attach
# them to a copy of df_init, leaving df_init itself unchanged:
#   total_punct - number of punctuation characters
#   punct_dist  - punctuation characters divided by text length
#   count_punct - per-mark occurrence dict
df = df_init.copy().assign(**{
    column: df_init['text'].apply(extractor)
    for column, extractor in (
        ('total_punct', total_punct),
        ('punct_dist', punct_dist),
        ('count_punct', count_punct),
    )
})
# Inspect a random sample to eyeball the new columns.
df.sample(10)

Unnamed: 0,text,gender,horoscope,job,age_group,tokenized,pos,total_punct,punct_dist,count_punct
217067,"Hello there. It is now Saturday, July 17 and I...",female,Gemini,indUnk,25-34,"hello there . it is now saturday , july 17 and...",INTJ ADV PUNCT PRON AUX ADV PROPN PUNCT PROPN ...,12,0.085106,"{'!': 5, '""': 0, '#': 0, '$': 0, '%': 0, '&': ..."
166646,"and another thing: ""I need to finish this up,""...",female,Cancer,indUnk,25-34,"and another thing : "" i need to finish this up...",CCONJ DET NOUN PUNCT PUNCT PRON VERB PART VERB...,35,0.061837,"{'!': 0, '""': 4, '#': 0, '$': 0, '%': 0, '&': ..."
456355,"Well, well... look who's starting an online bl...",female,Libra,indUnk,<18,"well , well ... look who 's starting an online...",INTJ PUNCT INTJ PUNCT VERB PRON AUX VERB DET A...,100,0.079114,"{'!': 1, '""': 0, '#': 0, '$': 0, '%': 0, '&': ..."
231482,Allison: you really musn't bohter with the hyp...,male,Aries,Consulting,<18,allison : you really musn't bohter with the hy...,PROPN PUNCT PRON ADV VERB ADJ ADP DET NOUN PUN...,115,0.156889,"{'!': 85, '""': 0, '#': 0, '$': 0, '%': 0, '&':..."
377849,"I sent an e-card to my ex this morning, to wis...",female,Leo,indUnk,18-24,"i sent an e - card to my ex this morning , to ...",PRON VERB DET NOUN NOUN NOUN ADP PRON NOUN DET...,48,0.046693,"{'!': 7, '""': 0, '#': 0, '$': 0, '%': 0, '&': ..."
512953,"""The best way to show our gratitude . . . is t...",male,Aries,indUnk,25-34,""" the best way to show our gratitude . . . is ...",PUNCT DET ADJ NOUN PART VERB PRON NOUN PUNCT P...,8,0.086022,"{'!': 0, '""': 2, '#': 0, '$': 0, '%': 0, '&': ..."
375704,"Divya, Dr Who is not a disease, it's a lifesty...",female,Capricorn,indUnk,<18,"divya , dr who is not a disease , it 's a life...",PROPN PUNCT PROPN PRON AUX PART DET NOUN PUNCT...,98,0.053059,"{'!': 7, '""': 4, '#': 0, '$': 0, '%': 0, '&': ..."
513210,"""It is righteousness and not peace which shoul...",male,Aries,indUnk,25-34,""" it is righteousness and not peace which shou...",PUNCT PRON AUX ADJ CCONJ PART NOUN DET AUX VER...,4,0.016807,"{'!': 0, '""': 2, '#': 0, '$': 0, '%': 0, '&': ..."
200209,I am so sick....lol THANKS ED. Although I have...,female,Aquarius,Student,25-34,i am so sick .... lol thanks ed . although i h...,PRON AUX ADV ADJ PUNCT INTJ PROPN PROPN PUNCT ...,25,0.047348,"{'!': 0, '""': 0, '#': 0, '$': 0, '%': 0, '&': ..."
330219,I can not stand people! That is all...I am lea...,female,Cancer,Arts,18-24,i can not stand people ! that is all ... i am ...,PRON AUX PART VERB NOUN PUNCT DET AUX DET PUNC...,5,0.064935,"{'!': 1, '""': 0, '#': 0, '$': 0, '%': 0, '&': ..."


In [8]:
# Persist the enriched frame (POS tags + punctuation features) without the
# row index.
# NOTE(review): `count_punct` holds dicts, which presumably get written as
# their string repr and need parsing when the CSV is read back - confirm.
df.to_csv(
    'tokenized/blog_authorship_tokenized_pos_punct_final.csv',
    index=False,
)