In [5]:
import pandas as pd
import os
import random
import re
import nltk
import gensim
import string
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from gensim import corpora
from gensim.models import LdaMulticore
from collections import defaultdict
import copy as cp

### Data Prep

In [9]:
#import test files for topic modelling, will need many more
path = 'G:/My Documents/EAGER/FirstPages/' #where are files stored
files = os.listdir(path)
files = [item for item in files if item.endswith('.txt')]

In [10]:
#generate pooled text and list of processed documents for topic model
combined_text = []
documents = []
for i in range(len(files)):
    with open(path + files[i], 'r') as my_file:
        text = my_file.readlines() #this is each separate paragraph including headers etc
        for i in ["\n", "\t", " "]: #remove the end of sentence tags
            text = [item.strip(i) for item in text]
        text = filter(lambda x : not x in [' ', '', '\t'],text) #remove the paragraphs that are just blank space
        #loop over text and add title elements to the paragraph they describe
        joined = []
        join = False
        for i in range(len(text)):
            if len(text[i]) < 40:
                join = True
            else:
                if join:
                    joined.append(text[i-1] + " " + text[i])
                else:
                    joined.append(text[i])
        combined_text +=joined # make a big list
        documents.append(joined) #make a list of lists

In [11]:
#tokenize text for topic modelling
def encode_item(text):
    clean = ""
    for item in text.split(" "):
        try:
            clean += str(item).encode('ascii','ignore') + " "
        except: #just skip things we can't encode for now
            pass
    return clean.rstrip()
stopwords = nltk.corpus.stopwords.words('english') + list(set(string.punctuation))
stemmer = SnowballStemmer("english")
def stem_tokenizer(text):
    words = [word for word in nltk.word_tokenize(text)]
    words_no_stop = [word for word in words if not word in stopwords]
    words_stemmed = [stemmer.stem(word) for word in words_no_stop]
    return words_stemmed
def return_for_model(text):
    all_content = [encode_item(item) for item in text]
    all_content_stem_token = map(stem_tokenizer, all_content)
    return all_content_stem_token
all_tokenized = return_for_model(combined_text)
by_document = [return_for_model(doc) for doc in documents]

In [12]:
random.seed(1)
#create topic model on full text
#will need to run on server when we have more data
full_dict = corpora.Dictionary(all_tokenized)
DT_matrix = [full_dict.doc2bow(doc) for doc in all_tokenized]
lda = LdaMulticore(DT_matrix, id2word=full_dict, num_topics=4) 
lda.print_topics(num_topics=4, num_words=10)

[(0,
  u'0.009*"use" + 0.007*"nram" + 0.006*"materi" + 0.006*"heat" + 0.006*"technolog" + 0.006*"thermoelectr" + 0.006*"electr" + 0.005*"teg" + 0.005*"nantero" + 0.005*"industri"'),
 (1,
  u'0.022*"thermoelectr" + 0.018*"materi" + 0.014*"technolog" + 0.009*"power" + 0.009*"heat" + 0.008*"product" + 0.008*"teg" + 0.008*"develop" + 0.008*"use" + 0.008*"applic"'),
 (2,
  u'0.021*"nanotub" + 0.018*"technolog" + 0.010*"product" + 0.009*"applic" + 0.009*"carbon" + 0.008*"use" + 0.008*"the" + 0.007*"develop" + 0.007*"electr" + 0.006*"catalysi"'),
 (3,
  u'0.013*"heat" + 0.012*"teg" + 0.012*"technolog" + 0.011*"product" + 0.010*"applic" + 0.010*"wast" + 0.010*"power" + 0.008*"temperatur" + 0.007*"system" + 0.006*"the"')]

In [13]:
#identify topics by paragraph
topic_vector = []
for doc in by_document:
    main_topic = []
    #later possibly expand to take into account topic probabilities
    for para in doc:
        topic_by_prob = sorted([(i[1],i[0]) for i in lda.get_document_topics(full_dict.doc2bow(para))], reverse = True)
        main_topic.append(topic_by_prob[0][1])
        topic_prob = topic_by_prob[0][0] #in case we want it later
    topic_vector.append(main_topic)
#add start and end keys for topic_vector
for doc in topic_vector:
    doc.insert(0, "start")
    doc.append("end")

In [14]:
topic_vector[1]

['start', 1, 0, 2, 1, 1, 1, 3, 3, 1, 3, 2, 1, 1, 0, 3, 3, 0, 3, 0, 0, 'end']

### Markov Model

In [15]:
from collections import defaultdict
import random

In [21]:
class markov_representation():
    def __init__(self):
        self.tokens = defaultdict(lambda: 0)
        self.transitions = defaultdict(lambda: defaultdict(lambda:0))
        self.for_matrix = defaultdict(lambda: 0)
    def add(self, list_of_tokens):
        for i in range(len(list_of_tokens) ):
            if i <= (len(list_of_tokens)-2):
                self.tokens[str(list_of_tokens[i])]+=1
                self.transitions[str(list_of_tokens[i])][str(list_of_tokens[i + 1])]+=1
                self.for_matrix[(str(list_of_tokens[i]),str(list_of_tokens[i + 1]))] +=1
            else: #separate block to add the last token to the list of tokens
                self.tokens[str(list_of_tokens[i])]+=1
                self.transitions[str(list_of_tokens[i])][None] +=1
    def test(self):
        #return self.transitions
        return self.for_matrix
    #to do: add transition matrix
    def make_transition_matrix(self):
        #tranform dictionary of tuples in self.for_matrix into matrix
        pass
    def generate_sequence(self):
        topic = 'start'
        document = ['start']
        while topic != 'end' and len(document)<20:
            options = self.transitions[topic]
            total = sum(options.values())
            temp = 0
            threshold = random.randint(0, total-1)
            for key, value in options.iteritems():
                temp += value
                if temp>threshold:
                    topic = key
                    break
            document.append(topic)
        if document[-1] != 'end':
            document.append('end')
        return document
        #I want to recursively call this to generate a sequence of the required length, but pass on this for now
#         if len(document) > min_length:
#             return document
#         else:
#             return self.generate_sequence(self, 5)

### Apply Model

In [23]:
model = markov_representation()
for doc in topic_vector:
    model.add(doc)
model.generate_sequence()

['start', '2', '1', '1', '0', 'end']

In [24]:
model.test()

defaultdict(<function __main__.<lambda>>,
            {('0', '0'): 7,
             ('0', '1'): 4,
             ('0', '2'): 4,
             ('0', '3'): 5,
             ('0', 'end'): 2,
             ('1', '0'): 9,
             ('1', '1'): 25,
             ('1', '2'): 4,
             ('1', '3'): 5,
             ('1', 'end'): 4,
             ('2', '0'): 1,
             ('2', '1'): 8,
             ('2', '2'): 17,
             ('2', '3'): 4,
             ('2', 'end'): 3,
             ('3', '0'): 5,
             ('3', '1'): 5,
             ('3', '2'): 4,
             ('3', '3'): 11,
             ('3', 'end'): 1,
             ('start', '1'): 5,
             ('start', '2'): 4,
             ('start', '3'): 1})