In [None]:
class PredictionTree():
    """A node of a prediction tree.

    Each node stores an item value, a reference to its parent node and a
    list of child nodes. Children are looked up by comparing their ``Item``
    with ``==``.
    """

    # Class-level defaults; every instance overrides them in __init__.
    Item = None
    Parent = None
    Children = None

    def __init__(self,itemValue=None):
        """Create a parentless node holding ``itemValue`` with no children."""
        self.Item = itemValue
        self.Children = []
        self.Parent = None

    def addChild(self, child):
        """Wrap ``child`` in a new node and append it to this node's children.

        No duplicate check is made: adding the same value twice creates two
        child nodes.
        """
        newchild = PredictionTree(child)
        newchild.Parent = self
        self.Children.append(newchild)

    def getChild(self,target):
        """Return the first child whose ``Item`` equals ``target``, else None."""
        for chld in self.Children:
            if chld.Item == target:
                return chld
        return None

    def getChildren(self):
        """Return the (live, mutable) list of child nodes."""
        return self.Children

    def hasChild(self,target):
        """Return True if some child's ``Item`` equals ``target``."""
        return self.getChild(target) is not None

    def removeChild(self,child):
        """Remove every child whose ``Item`` equals ``child``.

        The list is filtered in one pass and written back with a slice
        assignment: removing elements while iterating the same list skips
        the element that follows each removal, and the slice assignment
        keeps the list object returned by ``getChildren`` valid.
        """
        self.Children[:] = [
            chld for chld in self.Children if not (chld.Item == child)
        ]

In [None]:
import os
import json

class CPT():
    """Next-token predictor backed by an inverted index and a lookup table.

    Training data is a list of ``[token, next_token, seqid]`` rows. ``II``
    maps every token seen in a row to the ids of the rows containing it, and
    ``LT`` maps a row id to that row's ``next_token``. Prediction scores the
    ``next_token`` of every row that contains the query token.
    """

    # Class-level placeholders; each instance gets its own dicts in __init__.
    II = None # Inverted Index dictionary, where key : unique item, value : set of sequence ids containing this item
    LT = None # Lookup table dictionary, where key : id of a sequence (row), value : next token of that sequence

    def __init__(self):
        """Create an untrained model with empty index and lookup table."""
        self.II = {}
        self.LT = {}

    def _iter_reference_pairs(self, directory):
        """Yield ``(filename, token, next_token)`` for every file in ``directory``.

        Each file is parsed as JSON; element 0 of the document must have a
        ``"References"`` mapping of token -> mapping keyed by next token.
        """
        for path in os.listdir(directory):
            with open(os.path.join(directory, path)) as handle:
                document = json.load(handle)
            for token, next_tokens in document[0]["References"].items():
                for next_token in next_tokens:
                    yield path, token, next_token

    def load_files(self, train_dir, test_dir = None):
        """Read training (and optionally test) JSON files into pair rows.

        Every ``token -> next_token`` reference becomes a row
        ``[token, next_token, seqid]`` and is recorded in ``self.LT`` as
        ``seqid -> next_token``. In the test files, a reference whose next
        token is ``"__TOKEN_TO_PREDICT__"`` is not training data; it becomes a
        prediction target ``[filename, token]`` instead.

        Input: train_dir, a directory of JSON files; test_dir, an optional
        directory of JSON files (skipped when None).

        Returns: (data, target), e.g.
        ``([["A", "B", 0], ["B", "C", 1]], [["file.json", "C"]])``
        """
        data = [] # Rows [token, next_token, seqid] used to train the model
        target = [] # Rows [filename, token] whose next tokens are to be predicted
        seqid = 0

        for _, token, next_token in self._iter_reference_pairs(train_dir):
            data.append([token, next_token, seqid])
            self.LT[seqid] = next_token
            seqid += 1

        # os.listdir(None) would silently list the working directory, so the
        # test pass must be skipped explicitly when no directory is given.
        if test_dir is not None:
            for path, token, next_token in self._iter_reference_pairs(test_dir):
                if next_token == "__TOKEN_TO_PREDICT__":
                    target.append([path, token])
                else:
                    data.append([token, next_token, seqid])
                    self.LT[seqid] = next_token
                    seqid += 1

        return data, target

    def train(self, data):
        """Populate the Inverted Index from the training rows.

        Input: list of rows ``[token, next_token, seqid]``. Every element of a
        row except the last is indexed under the row's id (``row[2]``).

        Output: Boolean True
        """
        for row in data:
            seqid = row[2]
            for element in row[:-1]:
                self.II.setdefault(element, set()).add(seqid)

        return True

    def score(self, counttable, key, number_of_similar_sequences, number_items_counttable):
        """Update and return ``counttable`` with the score contribution for ``key``.

        The contribution is
        ``1 + 1/number_of_similar_sequences + 0.001/(number_items_counttable + 1)``.
        A key seen for the first time is assigned that value; a key already
        present has its stored score multiplied by it.

        Output: the same counttable dictionary (mutated in place). It is
        specific to one target and rebuilt for each prediction.
        """
        weight_level = 1/number_of_similar_sequences
        weight_distance = 1/(number_items_counttable+1)
        score = 1 + weight_level + weight_distance * 0.001

        if counttable.get(key) is None:
            counttable[key] = score
        else:
            counttable[key] = (score * counttable.get(key))

        return counttable

    def predict(self, target, n):
        """Predict up to ``n`` next tokens for every target.

        Input: target, a list of ``[NameOfFile, token]`` rows; n, the number
        of predictions required.

        Output: list of ``[NameOfFile, pred1, pred2, ...]``. Targets whose
        token is absent from the Inverted Index are skipped and produce no
        output row.
        """
        predictions = []

        for num, sequence in enumerate(target):
            matching_ids = self.II.get(sequence[1])
            if matching_ids is None:
                continue

            similar_sequences = list(matching_ids)
            counttable = {}

            # The position of a sequence in the list acts as the distance term.
            for count, seq in enumerate(similar_sequences):
                counttable = self.score(counttable,self.LT.get(seq),len(similar_sequences),count)

            pred = self.get_n_largest(counttable,n)
            predictions.append([sequence[0]] + pred)
            print(num) # progress indicator: index of the target just processed

        return predictions

    def get_n_largest(self,dictionary,n):
        """Return the keys of the ``n`` largest values, highest first.

        Ties keep the dictionary's insertion order.
        """
        largest = sorted(dictionary.items(), key = lambda t: t[1], reverse=True)[:n]
        return [key for key,_ in largest]

In [None]:
import datetime
import csv

# Driver: load the token files, train the model and predict the top 5 next
# tokens for every target, printing a timestamp around each phase.
# Alternative (full) data set:
#"C:\\Users\\t-dado\\Desktop\\TrainTokens","C:\\Users\\t-dado\\Desktop\\TestTokens"
# Small data set currently in use:
#"C:\\Users\\t-dado\\Desktop\\CPT-Others\\TrainTokens_Test","C:\\Users\\t-dado\\Desktop\\CPT-Others\\TestTokens_Test"
train_dir = "C:\\Users\\t-dado\\Desktop\\CPT-Others\\TrainTokens_Test"
test_dir = "C:\\Users\\t-dado\\Desktop\\CPT-Others\\TestTokens_Test"

model = CPT()

print("Started LOADING files at:", datetime.datetime.now())
data, target = model.load_files(train_dir, test_dir)
print("Size of target:", len(target))
print("Finished LOADING files at:", datetime.datetime.now())

print("Started TRAINING at:", datetime.datetime.now())
model.train(data)
print("Finished TRAINING at:", datetime.datetime.now())

print("Started PREDICTING at:", datetime.datetime.now())
predictions = model.predict(target, 5)
print("Finished PREDICTING at:", datetime.datetime.now())

"""
with open("predictions.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(predictions)
"""