In [32]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer

In [33]:
df = pd.read_csv(f"../data/bbc_text_cls.csv")
print(df['labels'].value_counts())
df = df[df['labels'] == 'business']
print(df['labels'].value_counts())

labels
sport            511
business         510
politics         417
tech             401
entertainment    386
Name: count, dtype: int64
labels
business    510
Name: count, dtype: int64


In [34]:
df.drop(columns='labels', inplace=True)
df['text'] = df['text'].str.replace(r'\n', ' ', regex=True)
df['text'] = df['text'].str.replace(r'\s+', ' ', regex=True)
df.head()

Unnamed: 0,text
0,Ad sales boost Time Warner profit Quarterly pr...
1,Dollar gains on Greenspan speech The dollar ha...
2,Yukos unit buyer faces loan claim The owners o...
3,High fuel prices hit BA's profits British Airw...
4,Pernod takeover talk lifts Domecq Shares in UK...


In [35]:
def get_tokens(text):
    return word_tokenize(text.lower())


In [36]:
df['text_list'] = df['text'].apply(get_tokens)
df.head()

Unnamed: 0,text,text_list
0,Ad sales boost Time Warner profit Quarterly pr...,"[ad, sales, boost, time, warner, profit, quart..."
1,Dollar gains on Greenspan speech The dollar ha...,"[dollar, gains, on, greenspan, speech, the, do..."
2,Yukos unit buyer faces loan claim The owners o...,"[yukos, unit, buyer, faces, loan, claim, the, ..."
3,High fuel prices hit BA's profits British Airw...,"[high, fuel, prices, hit, ba, 's, profits, bri..."
4,Pernod takeover talk lifts Domecq Shares in UK...,"[pernod, takeover, talk, lifts, domecq, shares..."


In [37]:
#unique word list
word_set = set()

for text in df['text_list']:
    word_set.update(text)
idx2word = list(word_set)
idx2word.insert(0, '<UNKNOWN>')

word2idx = {word:i for i, word in enumerate(idx2word)}
print(len(word2idx))
print(list(word2idx.items())[0:5])

13532
[('<UNKNOWN>', 0), ('centres', 1), ('network', 2), ('2-3', 3), ('£29bn', 4)]


In [38]:
def word_numerizer(arr):
    return [word2idx.get(word, 0) for word in arr]

In [39]:
df['text_list_int'] = df['text_list'].apply(lambda x: word_numerizer(x))
df['text_list_int']

0      [3138, 11276, 5629, 2316, 8970, 3468, 7027, 59...
1      [6379, 13368, 1662, 8299, 7525, 1372, 6379, 36...
2      [1960, 9211, 1412, 2554, 2760, 5834, 1372, 572...
3      [12795, 8186, 2736, 8940, 8250, 12647, 594, 80...
4      [4525, 1557, 11712, 7571, 12704, 5183, 13296, ...
                             ...                        
505    [3911, 10252, 4693, 3814, 12647, 10743, 2070, ...
506    [8971, 11487, 9780, 9688, 6659, 6504, 1372, 89...
507    [4935, 5542, 4364, 1662, 3911, 1372, 13339, 10...
508    [10894, 4643, 3075, 8658, 1662, 12269, 6014, 1...
509    [4796, 11237, 7302, 4670, 9442, 12398, 2430, 9...
Name: text_list_int, Length: 510, dtype: object

In [40]:
# Prepare A1, A2 and pi

A2 = {}

for sentence in df['text_list_int']: 
    #A2
    for word_index in range(len(sentence)-2):
        #key
        current_word = sentence[word_index]
        second_word  = sentence[word_index+1]
        third_word   = sentence[word_index+2]

        keytuple = (current_word, third_word)

        if keytuple not in A2:
            A2[keytuple] = {}
            A2[keytuple][second_word] = 1                    
        else:
            if second_word not in A2[keytuple]:
                A2[keytuple][second_word]= 1
            else:
                A2[keytuple][second_word]+= 1

In [41]:
# Matrices as probabilities

# FOR pi
def calc_pi_probs(dict_):
    #1- Not using log probabilities. We need real probabilities
    #2- No smoothing (+1). Model will not use word combinations 
    # that never appeared in training test. 
    new_pi = {}
    total_count = 0
    for val in dict_.values():
        total_count += val

    for key, value in dict_.items():
        new_pi[key] = (value)/(total_count)
    return new_pi

# FOR A
def calc_A_probsprobabilities(dict_):
    new_A = {}
    
    for key in dict_.keys():
        new_pi = calc_pi_probs(dict_[key])
        new_A[key] = new_pi
    
    return new_A

In [42]:
A2_probs = calc_A_probsprobabilities(A2)

In [43]:
#article = df['text_list_int'][0]
#count = 0
#for i in range(1,len(article)-2):
#    
#    new_article = article.copy()
#    first_word = article[i]
#    second_word= article[i+1]
#    third_word = article[i+2]
#    search_key = (first_word, third_word)
#
#    if search_key in A2_probs:
#        new_second_word = int(np.random.choice([*A2_probs[search_key].keys()],p=[*A2_probs[search_key].values()]))
#        if second_word!=new_second_word:
#            count+=1
#        new_article[i+1] = second_word
#print(f"Number of words spinned: {count}")

In [47]:
def article_spinner(articleidx):

    article = df['text_list_int'][articleidx]
    count = 0
    new_article = article.copy()
    for i in range(len(article)-2):

        first_word = article[i]
        second_word= article[i+1]
        third_word = article[i+2]
        search_key = (first_word, third_word)

        if search_key in A2_probs:
            new_second_word = int(np.random.choice([*A2_probs[search_key].keys()],p=[*A2_probs[search_key].values()]))
            if second_word!=new_second_word:
                count+=1
            new_article[i+1] = new_second_word
    print(f"Number of words spinned: {count}")

    print("======ORIGINAL TEXT======")
    print(idx2word_generator(article))
    print("======SPINNED  TEXT======")
    print(idx2word_generator(new_article))


#def idx2word_generator(_text_list_int):
#    text = []
#    for i in range(len(_text_list_int)):
#        text.append(idx2word[_text_list_int[i]])
#    text = ' '.join(text)
#    return text

def idx2word_generator(_text_list_int):
    text_list = [idx2word[i] for i in _text_list_int]
    detokenizer = TreebankWordDetokenizer()
    return detokenizer.detokenize(text_list)

In [48]:
article_spinner(500)

Number of words spinned: 202
water firm suez in argentina row a conflict between the argentine state and water firm aguas argentinas, controlled by france's suez, is casting doubt on the firm's future . the firm, which serves the province of buenos aires, wants a tariff rise of 60% to fund water-supply improvements . the government has rejected the 60% rise and wants aguas argentinas to make an annual investment of 400m pesos ($136m; £72.3m) in improvements . planning minister julio de vido has offered state help but not for "free". mr de vido said that the argentine state would not make a contribution "in the form of a subsidy". he has said a contribution could be made in return for a seat on the company's board . he added that the government is in discussions with aguas argentinas about what role it might take in the event that a state contribution is agreed . however, aguas argentinas told the argentine newspaper clarin it would not accept any change to its legal structure and, in p

In [49]:
article_spinner(100)

Number of words spinned: 137
australia rates at four year high australia is raising its benchmark interest rate to its highest level in four years despite signs of a slowdown in the country's economy . the reserve bank of australia lifted interest rates 0.25% to 5.5%, their first upwards move in more than a year . however, shortly after the bank made its decision, new figures showed a fall in economic growth in the last quarter . the bank said it had acted to curb inflation but the move was criticised by some analysts . the rate hike was the first since december 2003 and had been well-flagged in advance . however, opposition parties and some analysts said the move was ill-timed given data showing the australian economy grew just 0.1% between october and december and 1.5% on an annual basis . the figures, representing a decline from the 0.2% growth in gdp seen between july and september, were below market expectations . consumer spending remains strong, however, and the bank is concerne