In [1]:
import re
from glob import glob

import numpy as np
import pandas as pd
import spacy
from lxml import etree
from matplotlib import pyplot as plt

In [2]:
# Parse every author's XML file and collect one (author, document) record per
# <document> element.  Records are accumulated in a plain list and turned into
# a DataFrame once: concatenating inside the loop re-copies the whole frame on
# every iteration.
doc_records = []

# glob() returns paths in an OS-dependent order; sorting keeps the row order
# stable between runs and machines.
for xml_path in sorted(glob('PAN15-training/*.xml')):
    root = etree.parse(xml_path).getroot()
    author = root.attrib['id']
    doc_records.extend((author, child.text) for child in root.findall('document'))

# Naming the columns explicitly keeps the schema intact even when no file
# matched, so later cells that rely on the 'author' column still work.
pan15_docs = pd.DataFrame(doc_records, columns=['author', 'documents'])

pan15_docs.shape

(14166, 2)

In [3]:
# Preview the first rows of the parsed documents (one row per document).
pan15_docs.head()

Unnamed: 0,author,documents
0,02ae95de-7ee3-453a-978d-25d28b3f1a88,Things I want for my business cards but are to...
1,02ae95de-7ee3-453a-978d-25d28b3f1a88,"""painters produced their most highly valued wo..."
2,02ae95de-7ee3-453a-978d-25d28b3f1a88,@username your new discussion layout is confus...
3,02ae95de-7ee3-453a-978d-25d28b3f1a88,I never really understood why game environment...
4,02ae95de-7ee3-453a-978d-25d28b3f1a88,"@username 20k and 2048² on a gun, fine. But th..."


In [4]:
# Check the column dtypes of the documents frame.
pan15_docs.dtypes

author       object
documents    object
dtype: object

In [5]:
# Read the ':::'-separated label file (it has no header row), naming all eight
# fields, and keep only the author id and the two demographic labels.
pan15_labels = pd.read_csv(
    'PAN15-labels.txt',
    sep=':::',
    engine='python',  # multi-character separators need the python parser
    header=None,
    names=['author', 'gender', 'age_group', '0', '1', '2', '3', '4'],
)[['author', 'gender', 'age_group']]
pan15_labels.shape

(152, 3)

In [6]:
# Check the column dtypes of the labels frame.
pan15_labels.dtypes

author       object
gender       object
age_group    object
dtype: object

In [7]:
# Compare the sizes of the two frames before joining them.
pan15_docs.shape, pan15_labels.shape

((14166, 2), (152, 3))

In [8]:
# Spot-check a few label rows; the fixed seed makes the displayed rows
# reproducible when the notebook is re-run from a fresh kernel.
pan15_labels.sample(5, random_state=0)

Unnamed: 0,author,gender,age_group
128,c2253559-eb87-44ac-9a0b-a1d1a0cdfd48,F,18-24
70,5a3757da-1eda-4bb5-bf68-816fa10c2231,F,25-34
88,7f98ebb4-a2da-4579-9c3c-0f2fbaafc754,M,25-34
83,0b66092a-440e-4755-a624-759a580a1c70,F,25-34
95,c0f2ba6b-200e-44fb-ac60-81084eec9daa,M,18-24


In [9]:
# Attach each author's labels to every one of their documents
# (inner join on the author id).
pan15 = pan15_labels.merge(pan15_docs, how='inner', on='author')
pan15.shape

(14166, 4)

In [10]:
# Spot-check the merged frame; the fixed seed makes the displayed rows
# reproducible when the notebook is re-run from a fresh kernel.
pan15.sample(10, random_state=0)

Unnamed: 0,author,gender,age_group,documents
175,b38fee80-2fb1-4481-be25-e8297aae928d,M,25-34,Last 9 days. Tensed.\t\t
13761,03f72f70-7a00-4dbb-93b7-6c7f65954fc5,M,25-34,R.I.P. Douglas Engelbart. Father of the Mother...
10679,be1f9286-68c1-4864-b269-4b6de3b8b7eb,M,18-24,Okayyyy so Asians can't handle booze @username...
2297,703806e5-f04b-4e7c-8456-7340784ecf76,M,25-34,Kiss me in D.A.R.K. Tonight and love me when i...
2248,703806e5-f04b-4e7c-8456-7340784ecf76,M,25-34,Para complementar mi springy look de pool part...
13080,e7d022a6-2292-4206-93f5-e63d3b0c3523,M,18-24,You look so cool\t\t
13384,78fb5696-dc1c-4ea9-b4e4-ae67acd7de72,F,25-34,Mornig buongiorno! Milano in grey today!\t\t
13628,c7537d18-440b-451a-991a-22cb30f28d3a,F,50-XX,@username Heart-Breaker!!\t\t
3087,675ce760-4be6-46ef-8190-2009f80fe283,F,18-24,RT @username: Loyalty ain't shit these days I ...
13897,6919c68c-ce0f-47b7-a5f7-9e55eef6a557,F,25-34,Typhoon Kalmaegi hits south China - http://t.c...


In [11]:
# Simplify the author hash: keep only the text before the first '-'.
# The vectorised .str accessor replaces the per-row Python lambda; a value
# without any '-' is left unchanged, so re-running this cell is harmless.
pan15['author'] = pan15['author'].str.split('-', n=1).str[0]
pan15.sample(5, random_state=0)  # fixed seed -> reproducible preview

Unnamed: 0,author,gender,age_group,documents
4868,717b2dd5,M,25-34,.@username #loveskiva (Tweet [your school name...
4125,34e43885,M,25-34,"Coursera Lands $43M From The World Bank, Yuri ..."
13527,f3af2ca2,M,18-24,Fucked up\nInsecure\nNeurotic\nEmotional\n\nSo...
7119,fd7c89ad,F,18-24,"I spy to change the world, to change your worl..."
1668,92defdbb,F,18-24,"I belong with you, you belong with me, you're ..."


In [12]:
def clean(doc):
    """Normalise one raw document for downstream text processing.

    The text is lower-cased, then @mentions, #hashtags and anything starting
    with ``http`` up to the next whitespace are removed, runs of the same
    punctuation mark among ``. , : ! ?`` are collapsed to a single mark, and
    every run of whitespace (tabs and newlines included) becomes one space.
    Leading and trailing spaces are not stripped.

    Parameters
    ----------
    doc : str or None
        Raw document text.  ``None`` is accepted because an XML element
        without text content has ``.text`` equal to ``None``.

    Returns
    -------
    str
        The cleaned text; the empty string when ``doc`` is ``None``.
    """
    if doc is None:
        return ''
    doc = doc.lower()
    doc = re.sub(r'@\w+', '', doc)  # remove @mentions
    doc = re.sub(r'#\w+', '', doc)  # remove #hashtags
    doc = re.sub(r'http\S+', '', doc)  # remove links
    doc = re.sub(r'([.,:!?])\1+', r'\1', doc)  # remove repeated punctuation
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    # \s already matches tabs, so no separate tab replacement is needed.
    doc = re.sub(r'\s+', ' ', doc)
    return doc

In [13]:
# Add a cleaned copy of every document next to the raw text.
pan15['semicleaned'] = pan15['documents'].apply(clean)
# Fixed seed so the preview shows the same rows on every re-run.
pan15.sample(15, random_state=0)

Unnamed: 0,author,gender,age_group,documents,semicleaned
3416,a49c605b,M,35-49,"""5 Unconventional Tips for Finding and Followi...","""5 unconventional tips for finding and followi..."
13261,852149e1,F,25-34,3rd 9-hour workday in a row :( #sotired :(\t\t,3rd 9-hour workday in a row :( :(
4244,956f4dba,F,18-24,@username really wish youd notice me but we al...,really wish youd notice me but we all know th...
12065,88d5bfa3,F,25-34,When Cesare attacks Juan at dinner. Just yes. ...,when cesare attacks juan at dinner. just yes.
14142,c4a413a1,F,25-34,Photo: Y en este Día Internacional del Hombre…...,photo: y en este día internacional del hombre…...
3557,a6323e6d,F,25-34,Unlikely reviewer @username analyzes @username...,unlikely reviewer analyzes 's farewell album '...
1973,bb88ec91,F,18-24,If you ain't happy with your life then you ain...,if you ain't happy with your life then you ain...
8820,2298a7cc,F,35-49,Announcing the All-New St. Louis Observer http...,announcing the all-new st. louis observer
9889,1661bc1b,F,35-49,"@username: We met for a reason, either you're ...",": we met for a reason, either you're a blessin..."
12185,91c6d3f2,M,18-24,Oh they got me twisted they don't know that bo...,oh they got me twisted they don't know that bo...


In [14]:
nlp = spacy.load('en_core_web_sm')


def _join_lemmas(parsed):
    """Return the lemmas of an already-processed spaCy document, space-joined."""
    return " ".join(token.lemma_ for token in parsed).strip()


def tokenize(doc):
    """Lemmatize a single document with the loaded spaCy pipeline.

    Despite the name, the result is not a list of tokens: it is one string
    holding the lemma of every token, separated by single spaces and stripped
    of leading/trailing whitespace.

    Parameters
    ----------
    doc : str
        Text to process.

    Returns
    -------
    str
        Space-joined lemmas of ``doc``.
    """
    return _join_lemmas(nlp(doc))


# Process the whole column through nlp.pipe instead of calling nlp() once per
# row: the pipeline then works on batches of texts.  The parser and the entity
# recogniser are skipped for this call only, because only lemmas are read here;
# the shared `nlp` object itself keeps its full pipeline.
pan15['tokenized'] = [
    _join_lemmas(parsed)
    for parsed in nlp.pipe(pan15['semicleaned'].tolist(), disable=['parser', 'ner'])
]
# Fixed seed so the preview shows the same rows on every re-run.
pan15.sample(5, random_state=0)

Unnamed: 0,author,gender,age_group,documents,semicleaned,tokens
8590,89fab4e9,F,35-49,Remembering Steven Yantis > Abrupt visual onse...,remembering steven yantis > abrupt visual onse...,remember steven yantis > abrupt visual onset a...
6679,5a3757da,F,25-34,I'm just gonna sit here and wish you missed me...,i'm just gonna sit here and wish you missed me...,I be just go to sit here and wish you miss I t...
10331,d7f46166,F,50-XX,@username @username @username - Pockie and Ge...,- pockie and george washington - - they canno...,- pockie and george washington - - they can no...
2269,703806e5,M,25-34,A handsome man for a cute ice cream. @username...,a handsome man for a cute ice cream. televisa ...,a handsome man for a cute ice cream . televisa...
5701,3d50490f,F,25-34,"My soundtrack: ? ""Piove"" by Jovanotti http://t...","my soundtrack: ? ""piove"" by jovanotti","my soundtrack : ? "" piove "" by jovanotti"


In [15]:
# Persist the frame as a tab-separated file, without the row index.
pan15.to_csv(
    'pan15_semicleaned.tsv',
    index=False,
    sep='\t',
)