In [3]:
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
from ast import literal_eval
from html.parser import HTMLParser
import pickle
from nltk import sent_tokenize, word_tokenize
import random

# Hook tqdm into pandas (adds the progress_* variants of apply/map). Calling
# tqdm() with no iterable is what renders the empty progress widget shown below.
tqdm().pandas()

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




# Read the XML file into a pandas DataFrame

In [84]:
# Path to the WordPress XML export that the cells below parse
# (relative to the notebook's working directory).
acrofile = 'data/acrolinx.WordPress.2019-05-21.xml'

In [85]:
# Parse the export and locate the <channel> element; its <item> children are
# iterated in the next cell.
tree = ET.parse(acrofile)
root = tree.getroot()  # not used again in the cells visible here
channel = tree.find('channel')  # looks among the root's direct children

In [86]:
# Collect one entry per <item> in four parallel lists.
titles = []
dates = []
categories = []
texts = []

# Clark-notation tag of the namespaced <content:encoded> element whose text
# is stored in `texts`.
content_tag = '{http://purl.org/rss/1.0/modules/content/}encoded'

for item in channel.findall('item'):
    titles.append(item.find('title').text)
    dates.append(item.find('pubDate').text)

    # .text is None for an empty element; store '' so every entry is a string
    # and the string processing further down does not fail on it.
    texts.append(item.find(content_tag).text or '')

    # De-duplicate categories while keeping document order. Going through
    # set() would give an order that changes between interpreter runs.
    seen = set()
    cats = []
    for child in item.findall('category'):
        if child.text not in seen:
            seen.add(child.text)
            cats.append(child.text)
    categories.append(cats)

In [87]:
# Sanity check: the four lists are filled once per <item>, so they must all
# have the same length before being combined into a DataFrame.
assert len(titles) == len(dates) == len(categories) == len(texts)

In [88]:
# Combine the parallel lists into a single frame, one row per <item>.
df = pd.DataFrame(
    dict(title=titles, pubdate=dates, cats=categories, text=texts)
)

In [89]:
# Preview the first five rows.
df.head()

Unnamed: 0,cats,pubdate,text,title
0,"[Content Marketing, Tech Comm, Blog]","Thu, 20 Nov 2014 12:49:29 +0000",Imagine that you’ve just written what you beli...,10 Questions You’ve Got to Ask Yourself Before...
1,"[Content Marketing, Tech Comm, Love of Languag...","Mon, 17 Nov 2014 12:51:04 +0000","We recently shared <a href=""https://www.acroli...",Problem Words that Give Writers Trouble (Part 2)
2,"[Content Marketing, Tech Comm, Love of Languag...","Thu, 13 Nov 2014 12:52:27 +0000",Even the best writers can get tripped up somet...,Problem Words that Give Writers Trouble (Part 1)
3,"[Content Marketing, Tech Comm, Love of Languag...","Mon, 10 Nov 2014 12:55:17 +0000",Part of being a good content marketer is being...,6 Great Free (or Nearly Free) Tools for Conten...
4,"[Content Marketing, Tech Comm, Love of Languag...","Fri, 07 Nov 2014 12:57:14 +0000",Have you ever heard of Amy Cuddy? In addition ...,Why Writers Should Care About Their Posture


# text processing

In [90]:
# remove HTML tags
# https://stackoverflow.com/questions/753052/strip-html-from-strings-in-python

class MLStripper(HTMLParser):
    """HTML parser that discards all markup and keeps only the text nodes."""

    def __init__(self):
        # Let HTMLParser initialise its own state instead of replicating it by
        # hand. convert_charrefs=True makes character references such as
        # &amp; arrive already decoded in handle_data().
        super().__init__(convert_charrefs=True)
        self.fed = []  # text fragments, in document order

    def handle_data(self, d):
        """Collect one run of text found between tags."""
        self.fed.append(d)

    def get_data(self):
        """Return all text collected so far as a single string."""
        return ''.join(self.fed)


def strip_tags(html):
    """Return `html` with every tag removed and character references decoded.

    Parameters
    ----------
    html : str
        Markup to strip.

    Returns
    -------
    str
        The concatenated text content of `html`.
    """
    s = MLStripper()
    s.feed(html)
    # close() flushes text the parser is still buffering. Without it, trailing
    # text that contains a bare '&' (e.g. "AT&T") is never passed to
    # handle_data() and is silently lost.
    s.close()
    return s.get_data()

# Replace each row's HTML with its plain-text content. This overwrites the
# 'text' column, so the original markup is no longer available in df afterwards.
df['text'] = df['text'].apply(strip_tags)

Unnamed: 0,cats,pubdate,text,title
0,"[Content Marketing, Tech Comm, Blog]","Thu, 20 Nov 2014 12:49:29 +0000",Imagine that you’ve just written what you beli...,10 Questions You’ve Got to Ask Yourself Before...
1,"[Content Marketing, Tech Comm, Love of Languag...","Mon, 17 Nov 2014 12:51:04 +0000",We recently shared a post containing part 1 of...,Problem Words that Give Writers Trouble (Part 2)
2,"[Content Marketing, Tech Comm, Love of Languag...","Thu, 13 Nov 2014 12:52:27 +0000",Even the best writers can get tripped up somet...,Problem Words that Give Writers Trouble (Part 1)
3,"[Content Marketing, Tech Comm, Love of Languag...","Mon, 10 Nov 2014 12:55:17 +0000",Part of being a good content marketer is being...,6 Great Free (or Nearly Free) Tools for Conten...
4,"[Content Marketing, Tech Comm, Love of Languag...","Fri, 07 Nov 2014 12:57:14 +0000",Have you ever heard of Amy Cuddy? In addition ...,Why Writers Should Care About Their Posture


In [165]:
def clean_text(text):
    """Flatten newlines to spaces, trim both ends, and straighten curly quotes.

    The right single quote (’) becomes ' and the double quotes (“ ”) become ".
    Every other character is returned unchanged.
    """
    substitutions = str.maketrans({
        '\n': ' ',
        '’': "'",
        '“': '"',
        '”': '"',
    })
    return text.translate(substitutions).strip()

# Normalise whitespace and quotes in every row; overwrites the 'text' column.
df.text = df.text.apply(clean_text)

In [167]:
# Persist the cleaned frame so later work can start from this file instead of
# re-parsing the XML.
df.to_pickle('data/acrolinx_blog.pkl')

# how many sentences? & format for OpenNMT use

In [168]:
# Split every text into sentences and collect them in one flat list.
sents = []

for text in df.text:
    # extend() appends in place; `sents = sents + ...` would copy the whole
    # list on every iteration.
    sents.extend(sent_tokenize(text))

In [169]:
# Total number of sentences across all texts.
len(sents)

10919

In [170]:
# Show the last ten sentences.
sents[-10:]

["And bad results aren't necessarily all bad news.",
 "They give you the means to identify areas of concern and allow you to redirect resources where they're needed most.",
 'A robust set of analytics is a critical ingredient in the content governance process.',
 'Analytics allow you to evaluate and then optimize your content and your content creation processes before you ever publish anything.',
 'On the path to enterprise content governance.',
 'Getting your arms around your content operations can seem overwhelming, but this four-step action plan will start you on the path to active content governance.',
 "When coupled with an AI-powered content governance platform, you'll be able to capture your strategy, establish your current position, align every content contributor with the goals you've set, and measure and track your success.",
 "It's really the best way to eliminate content chaos from your business once and for all.",
 '✔ Capture your strategy   ✔ Establish where you are today

In [171]:
# Write one whitespace-tokenised sentence per line. Sentences of five
# characters or fewer are skipped.
# The encoding is explicit because the text contains non-ASCII characters
# (e.g. '✔') that the platform default encoding may be unable to write.
with open('data/OpenNMT files/src-acrolinx.txt', 'w', encoding='utf-8') as f:
    for sent in sents:
        if len(sent) > 5:
            f.write(' '.join(word_tokenize(sent)) + '\n')

# Look at results after OpenNMT informal→formal (inf->f) translation

In [2]:
# Load the source sentences and the model predictions; the two files are
# matched line by line in the cells below.
# The encoding is explicit so the result does not depend on the platform
# default. NOTE(review): assumes the prediction file is UTF-8 as well; confirm
# against how it was produced.
with open('data/OpenNMT files/src-acrolinx.txt', 'r', encoding='utf-8') as f:
    src = f.readlines()

with open('data/OpenNMT files/pred-acrolinx.txt', 'r', encoding='utf-8') as f:
    pred = f.readlines()

In [3]:
# Drop the trailing newline (and any other surrounding whitespace) from each line.
src = list(map(str.strip, src))
pred = list(map(str.strip, pred))

In [4]:
# Source and prediction lines are paired by position in the cells below, so
# the two lists must be the same length.
assert len(src) == len(pred)

In [5]:
# Spot-check ten randomly chosen source/prediction pairs. No seed is set in
# this cell, so the selection changes from run to run.
for idx in random.sample(range(len(src)), 10):
    # Source line, prediction line, then an empty line as a separator.
    print(src[idx], pred[idx], '', sep='\n')

Not only do you understand what the authors are trying to convey , you also get the sense that MailChimp is n't some big , soulless corporation .
Do you understand what the authors are trying to convey ? You also get the sense of soulless , soulless corporation .

It 's also where technical content faces the biggest challenges as a result of three converging trends : A rapid acceleration in software development cycles , including online launches Online hosting with embedded content , where bite-sized content produced at frequent intervals is replacing massive tome-sized documents produced over the course of many months An aspiration for localized content in multiple languages timed with product releases Let 's consider these trends , and their implications for technical content , individually .
In multiple languages , it 's technical content faces , the biggest challenges faces , a result of three months is content produced in software releases so I can understand technical content .



In [7]:
# Pair every source sentence with its prediction in a single string,
# separated by one space.
doccano = pd.DataFrame()

texts = [src[idx] + ' ' + pred[idx] for idx in range(len(src))]

In [9]:
# Store the combined strings as the frame's only column and preview it.
doccano['text'] = texts
doccano.head()

Unnamed: 0,text
0,Imagine that you 've just written what you bel...
1,You 're ready to get it off your plate and sen...
2,"But wait just a second ! But , wait , a second !"
3,"Before you hit the publish button , are you po..."
4,"After all , you 've probably worked hard to cr..."


In [14]:
# Export for doccano. With to_csv's defaults the row index is written too, as
# an unnamed first column.
doccano.to_csv('data/doccano.csv')

# GYAFC corpus to doccano format

In [6]:
# Load the previously saved frame. read_pickle can execute arbitrary code, so
# only use it on files you trust.
all_df = pd.read_pickle('data/rule_based_corrected_df.pkl')
# Peek at five rows starting at position 80000.
all_df[80000:80005]

Unnamed: 0,Original,Target 0,Target 1,Target 2,Target 3,Category,Dataset
80000,"I do not have an answer for the question, but ...","I don't know about that question, but I really...",,,,Entertainment_Music,train
80001,Do as you have been instructed.,Just do what everyone else said.,,,,Entertainment_Music,train
80002,"They got to is first, go ahead and go there.",They beat me to it... yes go there,,,,Entertainment_Music,train
80003,Did he not say he figured it out himself.,Didn't he say he came up with it himself,,,,Entertainment_Music,train
80004,"Yes it was, and I enjoyed it.",Yes it was I enjoyed it,,,,Entertainment_Music,train


In [8]:
# Build one string per row: the original sentence followed by each available
# target, separated by single spaces.
target_cols = ['Target 0', 'Target 1', 'Target 2', 'Target 3']

texts = []
for _, row in all_df.iterrows():
    parts = [row['Original']]
    # Skip missing targets (NaN/None). Passing them through str() would append
    # the literal text 'nan'/'None' to the row.
    parts.extend(str(row[col]) for col in target_cols if pd.notna(row[col]))
    texts.append(' '.join(parts))

In [9]:
# Wrap the combined strings in a single-column frame and preview it.
gyafc_docc = pd.DataFrame({'text': texts})
gyafc_docc.head()

Unnamed: 0,text
0,I mean that you have to really be her friend. ...
1,Are you posing a rhetorical question? Sounds l...
2,Men pretend to love in order to have intercour...
3,I do not intend to be mean. I don't want to be...
4,I would estimate an average of 45% initially b...
