# Article Spinner

In [1]:
import string
from collections import defaultdict
from functools import partial

import nltk
import numpy as np
import pandas as pd
from nltk.tokenize import TreebankWordDetokenizer, word_tokenize

In [2]:
random = np.random.default_rng(29587)

In [3]:
try:
    word_tokenize('hello world')
except:
    nltk.download('punkt')

In [4]:
!wget -O files/bbc_text_cls.csv -nc https://lazyprogrammer.me/course_files/nlp/bbc_text_cls.csv

File ‘files/bbc_text_cls.csv’ already there; not retrieving.


In [5]:
df = pd.read_csv('files/bbc_text_cls.csv')

In [6]:
documents = df.loc[df['labels'] == 'business']['text']
documents[:5]

0    Ad sales boost Time Warner profit\n\nQuarterly...
1    Dollar gains on Greenspan speech\n\nThe dollar...
2    Yukos unit buyer faces loan claim\n\nThe owner...
3    High fuel prices hit BA's profits\n\nBritish A...
4    Pernod takeover talk lifts Domecq\n\nShares in...
Name: text, dtype: object

In [7]:
A = defaultdict(partial(defaultdict, partial(defaultdict, int)))

In [8]:
for document in documents:
    tokens = word_tokenize(document)
    for i, token in enumerate(tokens):
        if i == 0 or i == len(tokens) - 1:
            continue
        A[tokens[i-1]][tokens[i+1]][token] += 1

In [9]:
A['production']['to']

defaultdict(int,
            {'unit': 1,
             'began': 1,
             'closer': 1,
             'struggles': 1,
             'capacity': 1,
             'facilities': 1,
             'continued': 1})

In [10]:
def choice(values: 'dict[str, float]') -> str:
    items, probs = zip(*values.items())
    total = np.sum(probs)
    rand = random.uniform() * total
    cumsum = 0
    for i, (prob, item) in enumerate(zip(probs, items)):
        cumsum += prob  # type: ignore
        if cumsum > rand:
            return item  # type: ignore
    assert False

In [11]:
def can_spin(token):
    if any(map('1234567890'.__contains__, token)):
        return False
    # if token[:1].isupper():
    #     return False
    return True

In [12]:
def can_replace(token, prev=None):
    if token in string.punctuation:
        return False
    if token == prev:
        return False
    if prev is not None and token[:1].upper() != prev[:1].upper():
        return False
    return True

In [13]:
def spin(document, percentage=1):
    tokens = word_tokenize(document)
    changed = 0
    flag = False
    for i in range(len(tokens)):
        if flag:
            flag = False
            continue
        token = tokens[i]
        if not can_spin(token):
            continue
        if i == 0 or i == len(tokens) - 1:
            continue
        prev = tokens[i - 1]
        if prev not in A:
            continue
        outer = A[prev]
        after = tokens[i + 1]
        if after not in outer or not any(
            map(partial(can_replace, prev=token), outer[after])
        ):
            continue
        mapping = outer[after]
        rand = random.uniform()
        if rand < percentage:
            word = token
            while not can_replace(word, token):
                word = choice(mapping)
            print(token, word)
            tokens[i] = word
            changed += 1
            flag = True
    twd = TreebankWordDetokenizer()
    return twd.detokenize(tokens), changed / len(tokens)

In [14]:
document = documents[random.choice(len(documents))]
(document, spin(document))

agreed access
pay put
Indonesian IMF
a an
conducted calculated
cotton case
penalty poll
agreed applies
three two
It IDS
what which
give get
Indonesia India
company country
disguise demand
invoice invitation
bribe boxes
Monsanto MCI
activists Asia
plans proposal
Indonesia Iraq
Despite During
bribe brand
authorise against
waiving wake
the their
Monsanto MCI
admitted agreed
number narrowing
chemicals-and-crops combined
irregularities inflation
internal in-depth
Justice justice
Securities social
Exchange European
SEC SUVs
criminal creditors
Department development
Justice Japan
SEC sector
bribe bide
assistant a
Monsanto Menatep
agreed admitted
pay provide
Department discovery
investigations investigation
It India
SEC state
settle shift
Monsanto Malaysia
actions aircraft
adding arguing
address allay
activities ante
and Analysts
throughout to


('Monsanto fined $1.5m for bribery\n\nThe US agrochemical giant Monsanto has agreed to pay a $1.5m (£799,000) fine for bribing an Indonesian official.\n\nMonsanto admitted one of its employees paid the senior official two years ago in a bid to avoid environmental impact studies being conducted on its cotton. In addition to the penalty, Monsanto also agreed to three years\' close monitoring of its business practices by the American authorities. It said it accepted full responsibility for what it called improper activities.\n\nA former senior manager at Monsanto directed an Indonesian consulting firm to give a $50,000 bribe to a high-level official in Indonesia\'s environment ministry in 2002. The manager told the company to disguise an invoice for the bribe as "consulting fees".\n\nMonsanto was facing stiff opposition from activists and farmers who were campaigning against its plans to introduce genetically-modified cotton in Indonesia. Despite the bribe, the official did not authorise 