In [1]:
import re
import glob
from lxml import etree
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [2]:
PAN15 = r'PAN15-training/'

# Build one frame per author file and concatenate once at the end. Calling
# pd.concat inside the loop copies every previously loaded row again on each
# iteration, which is quadratic in the number of files.
author_frames = []

# glob.glob gives no ordering guarantee (it depends on the filesystem), so the
# paths are sorted to keep the row order of `pan15` stable across runs.
for file in sorted(glob.glob(PAN15 + '*.xml')):
    tree = etree.parse(file)
    root = tree.getroot()
    # The root element's `id` attribute is used as the author label.
    author = root.attrib['id']
    # One row per direct <document> child; the scalar author is broadcast.
    documents = [child.text for child in root.findall('document')]
    author_df = pd.DataFrame({'author': author, 'documents': documents})
    author_frames.append(author_df)

# Fail here with a clear message rather than later with a KeyError on a
# column-less empty frame when the directory is wrong or empty.
if not author_frames:
    raise FileNotFoundError(f'No XML files found matching {PAN15}*.xml')

pan15 = pd.concat(author_frames, ignore_index=True)

pan15.shape

(14166, 2)

In [3]:
# Replace the raw author ids with integer class labels (fitted on this column).
author_encoder = LabelEncoder()
pan15['author'] = author_encoder.fit_transform(pan15['author'])
# Show a random sample of 15 rows to eyeball the encoded frame.
pan15.sample(n=15)

Unnamed: 0,author,documents
11706,126,@username Goodmorning boooo\t\t
10041,106,@username @username @username whattttt?\t\t
13756,147,Dreaming about the things we could be.\t\t
7752,82,"@username Enjoy it! And rock the bass, the sex..."
6143,65,99 Red Balloons - played with red balloons.: h...
6675,71,Lkkr fout ohohcherso\t\t
4177,44,i really wanna shove their sticks up their ass...
13269,142,When people say really mean things to you.. ht...
3476,37,Gnip and Twitter Bringing Social Data to Acade...
418,4,@username Im going to see it on friday with my...


In [4]:
def clean_doc(doc):
    """Lightly normalise one raw document string.

    Steps, applied in this order:

    1. lower-case the text;
    2. remove ``@mention`` tokens (``@`` followed by word characters);
    3. collapse a run of the same ``.``, ``,``, ``!`` or ``?`` to one character;
    4. remove every token that starts with ``http`` up to the next whitespace;
    5. collapse each run of whitespace (tabs included) to a single space.

    Leading and trailing whitespace is not stripped, so the result may begin
    or end with a single space.

    Parameters
    ----------
    doc : str or None
        Raw document text. ``None`` (for example the text of an element that
        has no content) is treated as an empty document.

    Returns
    -------
    str
        The cleaned text; ``''`` when ``doc`` is ``None``.
    """
    if doc is None:
        return ''
    doc = doc.lower()
    doc = re.sub(r'@\w+', '', doc)
    doc = re.sub(r'([.,!?])\1+', r'\1', doc)  # remove repeated punctuation
    doc = re.sub(r'http\S+', '', doc)
    # Raw string avoids the invalid '\s' escape; \s already matches tabs, so a
    # separate tab substitution is not needed.
    doc = re.sub(r'\s+', ' ', doc)
    return doc

In [5]:
# Run the cleaner over every raw document and keep the result next to the
# original text so both can be compared.
pan15['semicleaned'] = pan15['documents'].map(clean_doc)
# Random sample of 15 rows for a side-by-side look at raw vs. cleaned text.
pan15.sample(n=15)

Unnamed: 0,author,documents,semicleaned
8117,86,ugg life is compilcated right now no time for ...,ugg life is compilcated right now no time for ...
5657,60,#EnLaOficina #lvklabs (@username St. Andrews) ...,#enlaoficina #lvklabs ( st. andrews)
4094,43,twitturk.com sayfam: http://twitturk.com/twitu...,twitturk.com sayfam:
8869,93,"Runaway train, never going back.\t\t","runaway train, never going back."
5141,54,lmaooo ???? “@username: Kids these days lol ??...,lmaooo ? “: kids these days lol ? #aysiaysi #t...
8455,89,Top Italian Investors Talk About Their Country...,top italian investors talk about their country...
1094,11,Pastry of Nike Air force halen? #dezezomer #wi...,pastry of nike air force halen? #dezezomer #wit
6412,68,[Photo] tools #evolution http://t.co/Qarh77TH\t\t,[photo] tools #evolution
8437,89,"30 tips for avoiding #startup failure, from su...","30 tips for avoiding #startup failure, from su..."
6100,64,Bitch please! @username @username Mexsi Bocu h...,bitch please! mexsi bocu


In [6]:
# Persist the frame (without the row index) to a CSV in the working directory.
pan15.to_csv(path_or_buf='pan15_semicleaned.csv', index=False)