In [1]:
import numpy as np
import pandas as pd
import os
import csv
import string
from nltk import pos_tag, word_tokenize

np.random.seed(10)

In [2]:
initial = {}
first_order = {} 
second_order = {} 

In [3]:
def add2dict(d, k, v):
  if k not in d:
    d[k] = []
  d[k].append(v)

# [cat, cat, dog, dog, dog, dog, dog, mouse, ...]

In [4]:
path='C:/Mydata/natural_language_processing/bbc/business'
tokens=[]
for filename in os.listdir(path):
    file=[]
    for line in open(path+"/"+filename, encoding='utf-8'):
        file.append(line.rstrip().lower().split())
    tokens.append(file)

In [5]:
test_file_1_index = 1
test_file_2_index = 2
test_file_1= tokens[test_file_1_index]
test_file_2= tokens[test_file_2_index]
print(len(tokens))
del tokens[test_file_1_index],tokens[test_file_2_index]
print(len(tokens))

510
508


In [6]:
# Remove empty lists [] from tokens

filtered_tokens=[]
for index, token in enumerate(tokens):
    filtered_list = [item for item in token if item]
    filtered_tokens.append(filtered_list)

In [7]:
filtered_tokens[0][0]

['ad', 'sales', 'boost', 'time', 'warner', 'profit']

In [8]:
for doc_idx, doc in enumerate(filtered_tokens):
    for sentence_idx, sentence in enumerate(doc):
        for word_idx, word in enumerate(sentence):
            t = word
            initial[t] = initial.get(t, 0.) + 1
            if word_idx==0:
                t_p1 = sentence[word_idx+1]
                add2dict(first_order, t, t_p1)
            elif word_idx==len(sentence)-1:
                t_m1 = sentence[word_idx-1]
                add2dict(first_order, t_m1, t)
            else:
                t_m1 = sentence[word_idx-1]
                t_p1 = sentence[word_idx+1]
                add2dict(second_order, (t_m1, t_p1), t)

In [9]:
# normalize the distributions
initial_total = sum(initial.values())
for t, c in initial.items():
    initial[t] = c / initial_total

In [10]:
# convert [cat, cat, cat, dog, dog, dog, dog, mouse, ...]
# into {cat: 0.5, dog: 0.4, mouse: 0.1}

def list2pdict(ts):
  # turn each list of possibilities into a dictionary of probabilities
  d = {}
  n = len(ts)
  for t in ts:
    d[t] = d.get(t, 0.) + 1
  for t, c in d.items():
    d[t] = c / n
  return d

In [11]:
for t_1, ts in first_order.items():
  first_order[t_1] = list2pdict(ts)
for k, ts in second_order.items():
  second_order[k] = list2pdict(ts)

In [12]:
second_order

{('ad', 'boost'): {'sales': 1.0},
 ('sales', 'time'): {'boost': 1.0},
 ('boost', 'warner'): {'time': 1.0},
 ('time', 'profit'): {'warner': 1.0},
 ('quarterly', 'at'): {'profits': 1.0},
 ('profits', 'us'): {'at': 1.0},
 ('at', 'media'): {'us': 1.0},
 ('us', 'giant'): {'media': 0.1,
  'telecoms': 0.1,
  'banking': 0.2,
  'foods': 0.1,
  'retail': 0.1,
  'oil': 0.2,
  'mortgage': 0.1,
  'agrochemical': 0.1},
 ('media', 'timewarner'): {'giant': 1.0},
 ('giant', 'jumped'): {'timewarner': 1.0},
 ('timewarner', '76%'): {'jumped': 1.0},
 ('jumped', 'to'): {'76%': 0.5, '22%': 0.5},
 ('76%', '$1.13bn'): {'to': 1.0},
 ('to', '(£600m)'): {'$1.13bn': 1.0},
 ('$1.13bn', 'for'): {'(£600m)': 1.0},
 ('(£600m)', 'the'): {'for': 1.0},
 ('for', 'three'): {'the': 1.0},
 ('the', 'months'): {'three': 0.6206896551724138,
  '12': 0.1724137931034483,
  'six': 0.06896551724137931,
  'early': 0.034482758620689655,
  'last': 0.034482758620689655,
  'busiest': 0.034482758620689655,
  'five': 0.034482758620689655},


In [21]:
first_order[('going')]

{'abroad,': 1.0}

In [14]:
def sample_word(d):
  p0 = np.random.random()
  cumulative = 0
  for t, p in d.items():
    cumulative += p
    if p0 < cumulative:
      return t
  assert(False) # should never get here

In [18]:
def spin_sentence(old_sentence_list):
    sentence_list=old_sentence_list
    if sentence_list!=[]:
        for word_idx, word in enumerate(sentence_list):
            if word_idx!=0 and word_idx!=len(sentence_list)-1:
                t_m1 = sentence_list[word_idx-1]
                t_p1 = sentence_list[word_idx+1]
                key = (t_m1, t_p1)
                p_dist = second_order[key]
                # Alter 40% of changeable words
                if len(p_dist)>1 and np.random.random()<0.40:
                    new_word = sample_word(p_dist)
                    sentence_list[word_idx] = new_word
    return sentence_list

In [20]:
doc_idx=1
sen_idx=3
print(filtered_tokens[doc_idx][sen_idx])
#spin_sentence(filtered_tokens[doc_idx][sen_idx])

['rosneft', 'officials', 'were', 'unavailable', 'for', 'comment.', 'but', 'the', 'company', 'has', 'said', 'it', 'intends', 'to', 'take', 'action', 'against', 'menatep', 'to', 'recover', 'some', 'of', 'the', 'tax', 'claims', 'and', 'debts', 'owed', 'by', 'yugansk.', 'yukos', 'had', 'filed', 'for', 'bankruptcy', 'protection', 'in', 'a', 'us', 'court', 'in', 'an', 'attempt', 'to', 'prevent', 'the', 'forced', 'sale', 'of', 'its', 'main', 'production', 'arm.', 'the', 'sale', 'went', 'ahead', 'in', 'december', 'and', 'yugansk', 'was', 'sold', 'to', 'a', 'little-known', 'shell', 'company', 'which', 'in', 'turn', 'was', 'bought', 'by', 'rosneft.', 'yukos', 'claims', 'its', 'downfall', 'was', 'punishment', 'for', 'the', 'political', 'ambitions', 'of', 'its', 'founder', 'mikhail', 'khodorkovsky', 'and', 'has', 'vowed', 'to', 'sue', 'any', 'participant', 'in', 'the', 'sale.']


In [22]:
filtered_tokens[doc_idx][sen_idx]

['rosneft',
 'officials',
 'were',
 'unavailable',
 'for',
 'comment.',
 'but',
 'the',
 'company',
 'has',
 'said',
 'it',
 'intends',
 'to',
 'take',
 'action',
 'against',
 'menatep',
 'to',
 'recover',
 'some',
 'of',
 'the',
 'tax',
 'claims',
 'and',
 'debts',
 'owed',
 'by',
 'yugansk.',
 'yukos',
 'had',
 'filed',
 'for',
 'bankruptcy',
 'protection',
 'in',
 'a',
 'us',
 'court',
 'in',
 'an',
 'attempt',
 'to',
 'prevent',
 'the',
 'forced',
 'sale',
 'of',
 'its',
 'main',
 'production',
 'arm.',
 'the',
 'sale',
 'went',
 'ahead',
 'in',
 'december',
 'and',
 'yugansk',
 'was',
 'sold',
 'to',
 'a',
 'little-known',
 'shell',
 'company',
 'which',
 'in',
 'turn',
 'was',
 'bought',
 'by',
 'rosneft.',
 'yukos',
 'claims',
 'its',
 'downfall',
 'was',
 'punishment',
 'for',
 'the',
 'political',
 'ambitions',
 'of',
 'its',
 'founder',
 'mikhail',
 'khodorkovsky',
 'and',
 'has',
 'vowed',
 'to',
 'sue',
 'any',
 'participant',
 'in',
 'the',
 'sale.']

In [53]:
p0 = np.random.random()
print("p0 = ",p0)
cumulative = 0
for t1, p1 in first_order[t].items():
    if p1==1.0:
        print("word = ",t1)
        break
    cumulative += p
    if p0 < cumulative:
        print("cumulative = ",cumulative)
        print("word = ",t1)
        break

p0 =  0.5425443680112613
word =  cards


In [52]:
for t1, p1 in first_order[t].items():
    print(p1)

1.0


In [54]:
p0 = np.random.random()
print("p0 = ",p0)
cumulative = 0
for t2, p2 in second_order[t,t1].items():
    if p2==1.0:
        print("word = ",t2)
        break
    cumulative += p
    if p0 < cumulative:
        print("cumulative = ",cumulative)
        print("word = ",t2)
        break

p0 =  0.14217004760152696
word =  one


In [96]:
def generator(initial_d, first_d, second_d, max_words):
    words=[]
    counter=0
    current_word=' '
    
    word1=sample_word(initial_d)
    words.append(word1)
    word2=sample_word(first_d[word1])
    words.append(word2)
    while(word2!="END" and counter<max_words-2):
        current_word = word2
        word2 = sample_word(second_d[word1,word2])
        word1 = current_word
        words.append(word2)
        counter+=1
    # Create a new list with 'END' replaced by '.'
    new_list_of_words = ['.' if word == 'END' else word for word in words]
    sentence = ' '.join(new_list_of_words)
    print(sentence)

In [103]:
for generations in range(4):
    max_words = np.random.choice([5,12])
    generator(initial,first_order,second_order,max_words)

i have looked as if
with his pipe in his mouth and his brown jug .
though not yet all gone
halting perplexed behind the mountains


In [100]:
def generate():
  for i in range(4): # generate 4 lines
    sentence = []

    # initial word
    w0 = sample_word(initial)
    sentence.append(w0)

    # sample second word
    w1 = sample_word(first_order[w0])
    sentence.append(w1)

    # second-order transitions until END
    while True:
      w2 = sample_word(second_order[(w0, w1)])
      if w2 == 'END':
        break
      sentence.append(w2)
      w0 = w1
      w1 = w2
    print(' '.join(sentence))

In [101]:
generate()

and not melt snow or start a dormant tree
like some men folk no ones afraid of fire
into each other i know the country and now the very air of what i think ill get away
and i
