In [1]:
import re
from glob import glob
from lxml import etree
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [2]:
pan15_docs = pd.DataFrame()

for file in glob('PAN15-training/*.xml'):
    tree = etree.parse(file)
    root = tree.getroot()
    author = root.attrib['id']
    documents = [child.text for child in root.findall('document')]
    author_df = pd.DataFrame({'author': author, 'documents': documents})
    pan15_docs = pd.concat([pan15_docs, author_df], ignore_index=True)

pan15_docs.shape

(14166, 2)

In [3]:
pan15_docs.head()

Unnamed: 0,author,documents
0,02ae95de-7ee3-453a-978d-25d28b3f1a88,Things I want for my business cards but are to...
1,02ae95de-7ee3-453a-978d-25d28b3f1a88,"""painters produced their most highly valued wo..."
2,02ae95de-7ee3-453a-978d-25d28b3f1a88,@username your new discussion layout is confus...
3,02ae95de-7ee3-453a-978d-25d28b3f1a88,I never really understood why game environment...
4,02ae95de-7ee3-453a-978d-25d28b3f1a88,"@username 20k and 2048² on a gun, fine. But th..."


In [4]:
pan15_docs.dtypes

author       object
documents    object
dtype: object

In [5]:
# Load labels
pan15_labels = pd.read_csv('PAN15-labels.txt', delimiter=':::', engine='python', header=None, names=['author', 'gender', 'age_group', 'extroverted', 'stable', 'agreeable', 'conscientious', 'open'])
pan15_labels.shape

(152, 8)

In [6]:
pan15_labels.dtypes

author            object
gender            object
age_group         object
extroverted      float64
stable           float64
agreeable        float64
conscientious    float64
open             float64
dtype: object

In [7]:
pan15_docs.shape, pan15_labels.shape

((14166, 2), (152, 8))

In [8]:
pan15_labels.sample(5)

Unnamed: 0,author,gender,age_group,extroverted,stable,agreeable,conscientious,open
127,25789f19-6181-492d-9f0d-d3b6d07fcc97,F,35-49,0.4,0.3,0.2,0.4,0.5
143,3aae4b14-7dfe-4cb7-b82a-629d01d06da0,F,25-34,0.0,0.1,0.0,0.1,0.5
133,17591594-b75c-47c2-be3e-5dcbe6c45286,M,25-34,0.2,0.4,0.2,0.0,0.2
136,7175696d-d60e-44ed-aebc-1f8582567995,M,25-34,0.1,0.2,0.3,0.3,0.1
84,61c4aca8-6e7f-4da3-9b30-1f99f7656189,M,25-34,0.1,0.2,0.3,0.3,0.1


In [9]:
pan15 = pd.merge(pan15_labels, pan15_docs, on=['author'])
pan15.shape

(14166, 9)

In [10]:
pan15.sample(10)

Unnamed: 0,author,gender,age_group,extroverted,stable,agreeable,conscientious,open,documents
2375,f28ad171-3a6b-4f88-a3c9-feebe7b86747,F,35-49,0.4,0.3,0.2,0.4,0.5,Is Honey Boo Boo the Definition of an American...
13501,f3af2ca2-dbfb-4447-a4f2-dfdfdda58640,M,18-24,0.1,0.2,0.3,0.1,0.2,"Free your mind, let your consious be free.\t\t"
7470,3a396b48-d99c-4b22-9040-a50b6976f7cd,F,18-24,0.1,-0.3,0.2,-0.2,0.5,- Stand by me.\n- I will stand.\t\t
7394,f7ea5331-febf-4ed4-9416-7353c898ad35,M,25-34,0.1,0.3,0.2,0.3,0.0,Jolly Molly Jolly Molly Jolly Molly Jolly Moll...
6853,2b27ab92-c89d-4529-a238-905974a11cfd,M,25-34,-0.1,0.2,0.1,0.2,0.3,#LinkedIn Adds Trending Content List And A Sco...
9014,c0f2ba6b-200e-44fb-ac60-81084eec9daa,M,18-24,0.1,0.4,0.0,0.1,0.2,@username @username it was @username but now I...
6322,81584f97-1167-49e0-8da0-0d4d66f8e238,F,35-49,0.4,0.5,0.1,0.2,0.4,@username @username RT @username Do you know o...
774,18695d44-ddf4-4839-9b00-d87b93f2d4ec,M,18-24,0.1,-0.1,0.2,0.0,0.2,"""Hey yeah, dont let 'em know we're coming"".\t\t"
6318,81584f97-1167-49e0-8da0-0d4d66f8e238,F,35-49,0.4,0.5,0.1,0.2,0.4,@username hipster camper!\t\t
8625,89fab4e9-24f0-4a97-8e10-ca357cc7865c,F,35-49,0.0,0.3,0.5,0.4,0.2,We the undersigned members of the European neu...


In [11]:
pan15['author'] = pan15['author'].apply(lambda x: x.split('-')[0]) # simplify author hash
pan15.sample(5)

Unnamed: 0,author,gender,age_group,extroverted,stable,agreeable,conscientious,open,documents
308,e78b60f0,F,18-24,0.1,0.1,-0.1,0.5,0.1,"""Don't take me the wrong way... But...""\t\t"
3793,3be30fbc,M,25-34,0.2,0.2,0.1,0.1,0.5,@username But today is not that day!! http://t...
5681,3d50490f,F,25-34,0.2,0.1,0.2,0.1,0.1,"My soundtrack: ? ""Anyone Else But You"" by The ..."
12540,1d2a2248,F,50-XX,-0.2,-0.3,-0.1,0.0,0.5,@username Oh No!!!! Can you go back to dr? ...
12934,c657394c,F,35-49,0.4,0.5,0.1,0.2,0.4,"@username tokrat bolj službene teme, machine t..."


In [12]:
def clean(doc):
    doc = doc.lower()
    doc = re.sub(r'@\w+', '', doc) # remove @mentions
    doc = re.sub(r'#\w+', '', doc) # remove #hashtags
    doc = re.sub(r'http\S+', '', doc) # remove links
    doc = re.sub(r'([.,:!?])\1+', r'\1', doc) # remove repeated punctuation
    doc = re.sub('\d', '0', doc) # replace numbers with 0
    doc = re.sub('\s+', ' ', doc)
    return doc

In [13]:
pan15['semicleaned'] = pan15['documents'].apply(clean)
pan15.sample(15)

Unnamed: 0,author,gender,age_group,extroverted,stable,agreeable,conscientious,open,documents,semicleaned
5948,6c5201d6,M,25-34,0.2,0.4,0.2,0.1,0.4,@username Trying to contact #Nuance marketing ...,trying to contact marketing office for academ...
4731,994c8615,M,25-34,0.1,0.4,0.1,0.0,0.4,[Neverhood OST] Southern Front Porch Whistler:...,[neverhood ost] southern front porch whistler:...
4710,994c8615,M,25-34,0.1,0.4,0.1,0.0,0.4,per me e` un si` - The Garden- What We Are htt...,per me e` un si` - the garden- what we are via
8219,62bffe2f,M,50-XX,0.3,-0.1,-0.1,0.0,-0.1,@username ¿son *aplicaciones web*?\t\t,¿son *aplicaciones web*?
13153,b7bc6377,M,35-49,0.5,-0.1,0.2,0.2,0.5,Got it? http://t.co/8OVyeeqn7N\t\t,got it?
5082,e2041e6d,F,25-34,0.3,0.2,0.2,0.2,0.1,updating #iPad to #iOS6 and reading about new ...,updating to and reading about new mobile healt...
10204,97432f40,M,35-49,0.2,0.2,0.1,0.3,0.3,What You Should Know About the New Skype Trans...,what you should know about the new skype trans...
10470,69ba7d3d,F,25-34,0.2,0.1,0.2,0.1,0.1,Sister in town @username #gift #vans #offthewa...,sister in town …
250,60468a75,M,18-24,0.1,0.2,0.1,0.1,0.1,''I entered into the bathroom and what happene...,''i entered into the bathroom and what happene...
7943,61c4aca8,M,25-34,0.1,0.2,0.3,0.3,0.1,before the rain #milan #italy #afterlight #vsc...,before the rain loreto (milan metro)


In [14]:
import spacy

nlp = spacy.load('en_core_web_sm')

def tokenize(doc):
    return " ".join([token.lemma_ for token in nlp(doc)])

pan15['tokenized'] = pan15['semicleaned'].apply(tokenize)
pan15.sample(5)

Unnamed: 0,author,gender,age_group,extroverted,stable,agreeable,conscientious,open,documents,semicleaned,tokenized
7659,ed970294,M,18-24,0.1,0.2,0.2,0.0,0.1,@username then what are you talking about ????...,then what are you talking about ?,then what be you talk about ?
430,f1dcc4ff,F,18-24,0.0,-0.1,0.1,0.3,0.4,So you think you can dance kijken (:\t\t,so you think you can dance kijken (:,so you think you can dance kijken (:
2103,fde8eb00,F,25-34,0.3,0.2,0.2,0.2,0.1,Photo: Virtual Museum Showcases 130 Years Of N...,photo: virtual museum showcases 000 years of n...,photo : virtual museum showcase 000 year of nu...
1047,bde7a38f,F,25-34,0.0,-0.2,-0.3,0.3,0.2,Do what you will but please don't shut me out....,do what you will but please don't shut me out.,do what you will but please do not shut I out .
4427,3f27d01d,M,35-49,0.1,0.1,0.1,0.1,0.1,Integrating @username in the middle would be g...,"integrating in the middle would be great!, 'in...","integrate in the middle would be great ! , ' i..."


In [15]:
def pos_fw(doc):
    return " ".join([token.pos_ if not token.is_stop and not token.is_punct else token.text for token in nlp(doc)])

In [16]:
pan15['pos_fw'] = pan15['tokenized'].apply(pos_fw)
pan15.head()

Unnamed: 0,author,gender,age_group,extroverted,stable,agreeable,conscientious,open,documents,semicleaned,tokenized,pos_fw
0,3e2cdc34,M,25-34,0.3,0.5,0.1,0.2,0.2,How to Test Your Startup Idea for $50 http://t...,how to test your startup idea for $00,how to test your startup idea for $ 00,how to VERB your NOUN NOUN for SYM NUM
1,3e2cdc34,M,25-34,0.3,0.5,0.1,0.2,0.2,@username @username @username @username @usern...,"you've been quoted in my story ""new story""","you 've be quote in my story "" new story ""","SPACE you ' AUX be INTJ in my NOUN "" ADJ NOUN """
2,3e2cdc34,M,25-34,0.3,0.5,0.1,0.2,0.2,New Story http://t.co/Uu5AggZP #storify #cacer...,new story,new story,ADJ NOUN
3,3e2cdc34,M,25-34,0.3,0.5,0.1,0.2,0.2,@username @username @username @username You've...,you've been quoted in my story,you 've be quote in my story,SPACE you ' AUX be INTJ in my NOUN
4,3e2cdc34,M,25-34,0.3,0.5,0.1,0.2,0.2,@username @username @username @username @usern...,you've been quoted in my story,you 've be quote in my story,SPACE you ' AUX be INTJ in my NOUN


In [17]:
pan15.to_csv('pan15_semicleaned2.tsv', sep='\t', index=False)