In [1]:
import re
import glob
from lxml import etree
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [2]:
# Build one frame with a row per <document>, tagged with the id of the
# author file it came from.
author_frames = []

# glob returns paths in arbitrary, OS-dependent order; sorting keeps the row
# order of the resulting frame stable across runs and machines.
for file in sorted(glob.glob('PAN15-training/*.xml')):
    root = etree.parse(file).getroot()
    author = root.attrib['id']
    documents = [child.text for child in root.findall('document')]
    author_frames.append(pd.DataFrame({'author': author, 'documents': documents}))

# Concatenate once at the end: calling pd.concat inside the loop re-copies all
# previously loaded rows on every iteration (quadratic), and seeding it with an
# empty DataFrame triggers a dtype FutureWarning on recent pandas versions.
if author_frames:
    pan15 = pd.concat(author_frames, ignore_index=True)
else:
    # No files matched: keep the expected columns so the problem is visible
    # in the shape below rather than as a KeyError in a later cell.
    pan15 = pd.DataFrame(columns=['author', 'documents'])

pan15.shape

(14166, 2)

In [3]:
# Replace the author id strings with integer class labels. The fitted encoder
# is kept so the labels can be mapped back to ids with
# author_encoder.inverse_transform(...).
# NOTE: this overwrites the 'author' column, so re-running the cell refits the
# encoder on the integer labels rather than on the original ids.
author_encoder = LabelEncoder()
pan15['author'] = author_encoder.fit_transform(pan15['author'])

# Fixed seed so the preview shows the same rows on every run.
pan15.sample(15, random_state=0)

Unnamed: 0,author,documents
9327,98,Don't - @username ????????\t\t
5377,56,"“@username: RT@username ""Goals are the links i..."
11807,127,"Sobre el famoso índice H, 'Study Attempts To P..."
721,7,@username: Have fun and don't get scared with ...
1587,16,@username I knew you would love it- the whole ...
4848,51,i wouldn't mind these on Ryanair or easyJet………...
9885,105,What is a Bait & Switch? http://t.co/2z5JaIO\t\t
849,8,#approfondirecapolavori ? Down Colorful Hill –...
4049,43,Boosh! I just scored 63.34 Million in #SuperMA...
4539,48,I'm too stubborn to ask for help when I need i...


In [4]:
def clean_doc(doc):
    """Lightly normalise one document.

    Steps, in order: lowercase, drop @mentions, collapse runs of the same
    punctuation mark (one of . , : ! ?) to a single mark, drop tokens that
    start with "http", and collapse every whitespace run (tabs and newlines
    included) to a single space.

    Leading/trailing spaces left behind by removed tokens are NOT stripped.

    Parameters
    ----------
    doc : str or None
        Raw document text. None or a float NaN (a missing document) is
        treated as empty text.

    Returns
    -------
    str
        The normalised text; '' for a missing document.
    """
    # A missing document would otherwise fail on .lower() below.
    if doc is None or (isinstance(doc, float) and doc != doc):
        return ''

    doc = doc.lower()
    doc = re.sub(r'@\w+', '', doc)  # remove @mentions
    doc = re.sub(r'([.,:!?])\1+', r'\1', doc)  # remove repeated punctuation
    doc = re.sub(r'http\S+', '', doc)  # remove URLs
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    # \s already matches tabs, so no separate tab replacement is needed.
    doc = re.sub(r'\s+', ' ', doc)
    return doc

In [5]:
# Add a normalised copy of each document in a new column; the raw text stays
# untouched in 'documents'.
pan15['semicleaned'] = pan15['documents'].apply(clean_doc)

# Fixed seed so the preview shows the same rows on every run.
pan15.sample(15, random_state=0)

Unnamed: 0,author,documents,semicleaned
5291,55,Hilarious! -- Presidential Monster Action Figu...,hilarious! -- presidential monster action figu...
11067,118,@username @username badass he David\t\t,badass he david
1790,18,"""What would I recommend learning? [list 7 lang...","""what would i recommend learning? [list 7 lang..."
1012,10,"""We should be facing the truth"".\t\t","""we should be facing the truth""."
1076,11,Ielgh.. Ik hoor Musiq Soulchild op de radio. F...,ielgh. ik hoor musiq soulchild op de radio. f*...
12406,133,Man Terrified During Test Drive: http://t.co/N...,man terrified during test drive: vía
3843,41,"“@username: @username Saturday ????”< exactly,...","“: saturday ?”< exactly, years ago ?"
10389,110,“@username: Racism is stupid. http://t.co/QH2S...,“: racism is stupid.
7967,84,@username agreed!!!\t\t,agreed!
3264,35,“@username: Shes a little scared to get close ...,“: shes a little scared to get close to anyone...


In [6]:
# Write the frame (author label, raw text and cleaned text) to a
# tab-separated file, leaving out the row index.
pan15.to_csv(
    path_or_buf='pan15_semicleaned.tsv',
    sep='\t',
    index=False,
)