In [15]:
import pandas as pd
import numpy as np
import re

In [16]:
df = pd.read_csv(f"../data/bbc_text_cls.csv")
print(df['labels'].value_counts())
df = df[df['labels'] == 'business']
print(df['labels'].value_counts())

labels
sport            511
business         510
politics         417
tech             401
entertainment    386
Name: count, dtype: int64
labels
business    510
Name: count, dtype: int64


In [17]:
df.drop(columns='labels', inplace=True)
df.head()

Unnamed: 0,text
0,Ad sales boost Time Warner profit\n\nQuarterly...
1,Dollar gains on Greenspan speech\n\nThe dollar...
2,Yukos unit buyer faces loan claim\n\nThe owner...
3,High fuel prices hit BA's profits\n\nBritish A...
4,Pernod takeover talk lifts Domecq\n\nShares in...


In [18]:
df['text'] = df['text'].str.lower()
df['text'] = df['text'].str.replace(r'\n', ' ', regex=True)
df['text'] = df['text'].str.replace(r'[^\w\s]', '', regex=True)
df['text'] = df['text'].str.replace(r'[\d]', '', regex=True)
df['text_list'] = df['text'].apply(lambda x: x.split())
df.head()

Unnamed: 0,text,text_list
0,ad sales boost time warner profit quarterly p...,"[ad, sales, boost, time, warner, profit, quart..."
1,dollar gains on greenspan speech the dollar h...,"[dollar, gains, on, greenspan, speech, the, do..."
2,yukos unit buyer faces loan claim the owners ...,"[yukos, unit, buyer, faces, loan, claim, the, ..."
3,high fuel prices hit bas profits british airw...,"[high, fuel, prices, hit, bas, profits, britis..."
4,pernod takeover talk lifts domecq shares in u...,"[pernod, takeover, talk, lifts, domecq, shares..."


In [19]:
#unique word list
word_set = set()

for text in df['text_list']:
    word_set.update(text)
idx2word = list(word_set)
idx2word.insert(0, '<UNKNOWN>')

word2idx = {word:i for i, word in enumerate(idx2word)}
print(len(word2idx))
print(list(word2idx.items())[0:5])

11963
[('<UNKNOWN>', 0), ('knowledgebased', 1), ('roubles', 2), ('limited', 3), ('increases', 4)]


In [20]:
def word_numerizer(arr):
    return [word2idx.get(word, 0) for word in arr]

In [21]:
df['text_list_int'] = df['text_list'].apply(lambda x: word_numerizer(x))
df['text_list_int']

0      [7604, 9894, 158, 9391, 5725, 5877, 3650, 1966...
1      [11199, 6944, 6211, 9466, 8812, 47, 11199, 991...
2      [3985, 9762, 6263, 10552, 4553, 5293, 47, 3941...
3      [2052, 3053, 10743, 10712, 3008, 1966, 7819, 1...
4      [10784, 3396, 3582, 2511, 4742, 11313, 2468, 3...
                             ...                        
505    [9169, 2288, 272, 6672, 11776, 10702, 47, 9169...
506    [3139, 778, 8331, 7436, 5022, 10096, 47, 3139,...
507    [11945, 6877, 9282, 6211, 9169, 47, 4876, 1409...
508    [10082, 11712, 3439, 1691, 6211, 2524, 1367, 1...
509    [7369, 11875, 6674, 7060, 7086, 5780, 4002, 49...
Name: text_list_int, Length: 510, dtype: object

In [22]:
# Prepare A1, A2 and pi

A2 = {}

for sentence in df['text_list_int']: 
    #A2
    for word_index in range(len(sentence)-2):
        #key
        current_word = sentence[word_index]
        second_word  = sentence[word_index+1]
        third_word   = sentence[word_index+2]

        keytuple = (current_word, third_word)

        if keytuple not in A2:
            A2[keytuple] = {}
            A2[keytuple][second_word] = 1                    
        else:
            if second_word not in A2[keytuple]:
                A2[keytuple][second_word]= 1
            else:
                A2[keytuple][second_word]+= 1

In [23]:
# Matrices as probabilities

# FOR pi
def calc_pi_probs(dict_):
    #1- Not using log probabilities. We need real probabilities
    #2- No smoothing (+1). Model will not use word combinations 
    # that never appeared in training test. 
    new_pi = {}
    total_count = 0
    for val in dict_.values():
        total_count += val

    for key, value in dict_.items():
        new_pi[key] = (value)/(total_count)
    return new_pi

# FOR A
def calc_A_probsprobabilities(dict_):
    new_A = {}
    
    for key in dict_.keys():
        new_pi = calc_pi_probs(dict_[key])
        new_A[key] = new_pi
    
    return new_A

In [24]:
A2_probs = calc_A_probsprobabilities(A2)

In [None]:
def article_spinner(article):
    

In [26]:
A2

{(7604, 158): {9894: 1},
 (9894, 9391): {158: 1},
 (158, 5725): {9391: 1},
 (9391, 5877): {5725: 1},
 (5725, 3650): {5877: 1},
 (5877, 1966): {3650: 1},
 (3650, 349): {1966: 1},
 (1966, 7086): {349: 1},
 (349, 10932): {7086: 1},
 (7086, 5630): {10932: 1,
  9749: 1,
  6502: 2,
  4517: 1,
  11566: 1,
  766: 2,
  7308: 1,
  6568: 1},
 (10932, 8161): {5630: 1},
 (5630, 2377): {8161: 1},
 (8161, 9955): {2377: 1, 8025: 1},
 (2377, 11386): {9955: 2},
 (9955, 479): {11386: 2,
  5933: 10,
  11950: 1,
  11383: 1,
  2138: 2,
  11646: 1,
  11414: 2,
  198: 1,
  3038: 1,
  479: 7,
  2360: 3,
  2817: 1,
  7446: 2,
  585: 1,
  265: 1,
  5251: 1},
 (11386, 5426): {479: 2,
  11386: 6,
  6020: 1,
  1294: 2,
  5293: 1,
  6437: 1,
  8525: 1,
  9894: 1},
 (479, 47): {5426: 5,
  5002: 1,
  2468: 14,
  8676: 1,
  1481: 7,
  9667: 3,
  479: 3,
  8450: 1,
  6211: 2,
  5913: 1,
  1170: 1,
  6020: 1,
  10034: 2,
  11386: 1,
  11434: 1,
  349: 1,
  3956: 1,
  4838: 1,
  9955: 2,
  7060: 1,
  272: 1,
  3810: 1},
 

In [None]:
article = df['text_list_int'][0]
for i in range(1,len(article)-1):
    first_word = article[i]
    second_word = article[i+1]
    third_word = article[i+2]
    search_key = (first_word, third_word)
    if search_key

[7604,
 9894,
 158,
 9391,
 5725,
 5877,
 3650,
 1966,
 349,
 7086,
 10932,
 5630,
 8161,
 2377,
 9955,
 11386,
 479,
 5426,
 47,
 11378,
 1789,
 9955,
 2372,
 1481,
 479,
 5002,
 47,
 5780,
 8613,
 8025,
 6679,
 6108,
 272,
 47,
 2506,
 6258,
 2468,
 6558,
 7592,
 1481,
 9894,
 272,
 9419,
 967,
 6856,
 5406,
 3639,
 8933,
 9894,
 8161,
 6606,
 2477,
 6089,
 9894,
 5824,
 9955,
 11386,
 1481,
 11386,
 4005,
 1966,
 10428,
 10825,
 7060,
 2435,
 6944,
 8613,
 4620,
 11646,
 5877,
 2445,
 349,
 5725,
 9002,
 5406,
 2808,
 2745,
 5426,
 9580,
 9391,
 5725,
 6606,
 6211,
 1864,
 730,
 5946,
 6679,
 11872,
 272,
 6599,
 6558,
 10034,
 4005,
 1168,
 967,
 6212,
 9580,
 4434,
 9917,
 11120,
 6261,
 5946,
 5942,
 4699,
 2468,
 47,
 2477,
 6089,
 1966,
 10428,
 7549,
 1225,
 2468,
 47,
 9180,
 11378,
 6635,
 3763,
 47,
 1448,
 6606,
 9740,
 242,
 5877,
 10949,
 9908,
 7324,
 5824,
 6211,
 47,
 9637,
 272,
 6685,
 967,
 1596,
 4595,
 5946,
 6614,
 9955,
 11123,
 4699,
 7060,
 868,
 47,
 445,
 6