In [116]:
import pandas as pd
import numpy as np
import re

In [117]:
df = pd.read_csv(f"../data/bbc_text_cls.csv")
print(df['labels'].value_counts())
df = df[df['labels'] == 'business']
print(df['labels'].value_counts())

labels
sport            511
business         510
politics         417
tech             401
entertainment    386
Name: count, dtype: int64
labels
business    510
Name: count, dtype: int64


In [118]:
df.drop(columns='labels', inplace=True)
df.head()

Unnamed: 0,text
0,Ad sales boost Time Warner profit\n\nQuarterly...
1,Dollar gains on Greenspan speech\n\nThe dollar...
2,Yukos unit buyer faces loan claim\n\nThe owner...
3,High fuel prices hit BA's profits\n\nBritish A...
4,Pernod takeover talk lifts Domecq\n\nShares in...


In [119]:
df['text'] = df['text'].str.lower()
df['text'] = df['text'].str.replace(r'\n', ' ', regex=True)
df['text'] = df['text'].str.replace(r'[^\w\s]', '', regex=True)
df['text'] = df['text'].str.replace(r'[\d]', '', regex=True)
df['text'] = df['text'].str.replace(r'\s+', ' ', regex=True)
df['text_list'] = df['text'].apply(lambda x: x.split())
df.head()

Unnamed: 0,text,text_list
0,ad sales boost time warner profit quarterly pr...,"[ad, sales, boost, time, warner, profit, quart..."
1,dollar gains on greenspan speech the dollar ha...,"[dollar, gains, on, greenspan, speech, the, do..."
2,yukos unit buyer faces loan claim the owners o...,"[yukos, unit, buyer, faces, loan, claim, the, ..."
3,high fuel prices hit bas profits british airwa...,"[high, fuel, prices, hit, bas, profits, britis..."
4,pernod takeover talk lifts domecq shares in uk...,"[pernod, takeover, talk, lifts, domecq, shares..."


In [120]:
#unique word list
word_set = set()

for text in df['text_list']:
    word_set.update(text)
idx2word = list(word_set)
idx2word.insert(0, '<UNKNOWN>')

word2idx = {word:i for i, word in enumerate(idx2word)}
print(len(word2idx))
print(list(word2idx.items())[0:5])

11963
[('<UNKNOWN>', 0), ('knowledgebased', 1), ('roubles', 2), ('limited', 3), ('increases', 4)]


In [121]:
def word_numerizer(arr):
    return [word2idx.get(word, 0) for word in arr]

In [122]:
df['text_list_int'] = df['text_list'].apply(lambda x: word_numerizer(x))
df['text_list_int']

0      [7604, 9894, 158, 9391, 5725, 5877, 3650, 1966...
1      [11199, 6944, 6211, 9466, 8812, 47, 11199, 991...
2      [3985, 9762, 6263, 10552, 4553, 5293, 47, 3941...
3      [2052, 3053, 10743, 10712, 3008, 1966, 7819, 1...
4      [10784, 3396, 3582, 2511, 4742, 11313, 2468, 3...
                             ...                        
505    [9169, 2288, 272, 6672, 11776, 10702, 47, 9169...
506    [3139, 778, 8331, 7436, 5022, 10096, 47, 3139,...
507    [11945, 6877, 9282, 6211, 9169, 47, 4876, 1409...
508    [10082, 11712, 3439, 1691, 6211, 2524, 1367, 1...
509    [7369, 11875, 6674, 7060, 7086, 5780, 4002, 49...
Name: text_list_int, Length: 510, dtype: object

In [123]:
# Prepare A1, A2 and pi

A2 = {}

for sentence in df['text_list_int']: 
    #A2
    for word_index in range(len(sentence)-2):
        #key
        current_word = sentence[word_index]
        second_word  = sentence[word_index+1]
        third_word   = sentence[word_index+2]

        keytuple = (current_word, third_word)

        if keytuple not in A2:
            A2[keytuple] = {}
            A2[keytuple][second_word] = 1                    
        else:
            if second_word not in A2[keytuple]:
                A2[keytuple][second_word]= 1
            else:
                A2[keytuple][second_word]+= 1

In [124]:
# Matrices as probabilities

# FOR pi
def calc_pi_probs(dict_):
    #1- Not using log probabilities. We need real probabilities
    #2- No smoothing (+1). Model will not use word combinations 
    # that never appeared in training test. 
    new_pi = {}
    total_count = 0
    for val in dict_.values():
        total_count += val

    for key, value in dict_.items():
        new_pi[key] = (value)/(total_count)
    return new_pi

# FOR A
def calc_A_probsprobabilities(dict_):
    new_A = {}
    
    for key in dict_.keys():
        new_pi = calc_pi_probs(dict_[key])
        new_A[key] = new_pi
    
    return new_A

In [125]:
A2_probs = calc_A_probsprobabilities(A2)

In [126]:
#article = df['text_list_int'][0]
#count = 0
#for i in range(1,len(article)-2):
#    
#    new_article = article.copy()
#    first_word = article[i]
#    second_word= article[i+1]
#    third_word = article[i+2]
#    search_key = (first_word, third_word)
#
#    if search_key in A2_probs:
#        new_second_word = int(np.random.choice([*A2_probs[search_key].keys()],p=[*A2_probs[search_key].values()]))
#        if second_word!=new_second_word:
#            count+=1
#        new_article[i+1] = second_word
#print(f"Number of words spinned: {count}")

In [159]:
def article_spinner(articleidx):

    article = df['text_list_int'][articleidx]
    count = 0
    new_article = article.copy()
    for i in range(len(article)-2):

        first_word = article[i]
        second_word= article[i+1]
        third_word = article[i+2]
        search_key = (first_word, third_word)

        if search_key in A2_probs:
            new_second_word = int(np.random.choice([*A2_probs[search_key].keys()],p=[*A2_probs[search_key].values()]))
            if second_word!=new_second_word:
                count+=1
            new_article[i+1] = new_second_word
    print(f"Number of words spinned: {count}")

    print("======ORIGINAL TEXT======")
    print(idx2word_generator(article))
    print("======SPINNED  TEXT======")
    print(idx2word_generator(new_article))


def idx2word_generator(_text_list_int):
    text = []
    for i in range(len(_text_list_int)):
        text.append(idx2word[_text_list_int[i]])
    text = ' '.join(text)
    return text

In [160]:
article_spinner(500)

Number of words spinned: 161
water firm suez in argentina row a conflict between the argentine state and water firm aguas argentinas controlled by frances suez is casting doubt on the firms future the firm which serves the province of buenos aires wants a tariff rise of to fund watersupply improvements the government has rejected the rise and wants aguas argentinas to make an annual investment of m pesos m m in improvements planning minister julio de vido has offered state help but not for free mr de vido said that the argentine state would not make a contribution in the form of a subsidy he has said a contribution could be made in return for a seat on the companys board he added that the government is in discussions with aguas argentinas about what role it might take in the event that a state contribution is agreed however aguas argentinas told the argentine newspaper clarin it would not accept any change to its legal structure and in practice this rules out state participation on its