In [112]:
import pandas as pd
import re, math, nltk

In [113]:
def load():
    '''
    Read the workbook 'DATA.xlsx' (path relative to the working directory)
    with pandas and return whatever read_excel produces for it.
    '''
    frame = pd.read_excel('DATA.xlsx', index_col=None)
    return frame

In [125]:
def baselineOne():
    '''
    This method simply looks for the word "discount" and determines if a
    range or exact figure is given.

    Every string in the 'Text' column returned by load() is split into
    sentences.  For each sentence containing the token "discount", the (up
    to) ten tokens after its first occurrence are scanned for decimal
    numbers such as "7.25", and one result line is printed.  Nothing is
    returned.
    '''
    # Raw string: the same pattern as before, but without relying on the
    # invalid string escapes "\d" / "\." that newer Pythons warn about.
    decimal = re.compile(r"^\d+?\.\d+?$")

    def process(document):
        # Split a document into sentences.
        return nltk.sent_tokenize(document)

    def search(word):
        # `word` is the list of tokens that follow "discount".
        newList = [s for s in word if decimal.match(s) is not None]
        if not newList:
            return "No discount rate found!"
        if len(newList) == 1:
            return "The discount rate used is " + newList[0] + "."
        # Two or more figures: only the first two are reported as the range.
        return "The discount rate ranged from " + newList[0] + " to " + newList[1] + "."

    # Start of main code
    df = load()['Text']
    for document in df:
        # Skip empty cells (NaN) and anything else that is not text; the
        # tokenizer below only makes sense for strings.
        if not isinstance(document, str):
            continue
        for sentence in process(document):
            w = nltk.word_tokenize(sentence)
            if "discount" in w:
                idx = w.index("discount")
                print(search(w[idx + 1 : idx + 11]))
# Run the baseline; it prints one result line per sentence that contains "discount".
baselineOne()

The discount rate ranged from 7.00 to 8.00.
No discount rate found!
The discount rate used is 8.5.
No discount rate found!
The discount rate ranged from 6.75 to 7.75.
No discount rate found!
The discount rate used is 8.5.
No discount rate found!
The discount rate used is 8.5.
No discount rate found!
The discount rate ranged from 10.5 to 12.5.
No discount rate found!
The discount rate used is 7.75.
No discount rate found!
No discount rate found!
The discount rate ranged from 7.25 to 8.25.
No discount rate found!
The discount rate ranged from 7.0 to 8.0.
The discount rate ranged from 7.25 to 8.25.
The discount rate ranged from 7.00 to 8.00.
No discount rate found!
The discount rate ranged from 8.0 to 10.0.
No discount rate found!
The discount rate ranged from 9.0 to 10.0.
No discount rate found!
The discount rate ranged from 6.00 to 7.50.
No discount rate found!
No discount rate found!
The discount rate ranged from 11.0 to 13.0.
No discount rate found!
The discount rate ranged from 11.0 

In [133]:
'''
This method implements window classification. 
'''
def SGD(data, learning, numepoch, numFeature):
    '''
    Fit the coefficients of a sigmoid model by per-sample gradient descent
    on the squared error (p - label) ** 2.

    data       -- re-iterable collection (e.g. a list) of (features, label)
                  pairs; it is traversed once per epoch.
    learning   -- step size applied to every update.
    numepoch   -- number of passes over data.
    numFeature -- number of coefficients; each feature vector must not be
                  longer than this.

    Returns the list of numFeature coefficients (all zero if no update ran).
    '''
    def predict(f, coeff):
        ttl = sum(a * b for a, b in zip(f, coeff))
        try:
            return 1.0 / (1.0 + math.exp(-ttl))
        except OverflowError:
            # exp(-ttl) is too large for a float, i.e. ttl is hugely
            # negative; the sigmoid is 0 to within float precision.
            return 0.0

    coeff = [0 for _ in range(numFeature)]
    for _ in range(numepoch):
        for f, c in data:
            p = predict(f, coeff)
            err = p - c
            # Gradient of the squared error through the sigmoid:
            # err * p * (1 - p) * x for each input x.
            for i, x in enumerate(f):
                coeff[i] -= learning * err * p * (1 - p) * x
    return coeff

def createData(data, features):
    '''
    Turn raw documents into labelled, featurized token windows.

    Every document is word-tokenized; for each token that starts like a
    number, a window of up to 5 tokens on either side is recorded.  Each
    window is then converted into a feature vector with one slot per entry
    of `features` (slot meaning is fixed by position, see featurize below).

    data     -- iterable of documents; entries that are not strings (for
                example NaN for empty cells) are skipped but still consume
                a document number.
    features -- list of feature descriptions; only its length is used.

    Returns (featurized, processed, mapping):
      featurized -- list of (feature_vector, label) with label 1 for the
                    hand-picked window numbers below and 0 otherwise;
      processed  -- list of (window_tokens, 0), in the same order;
      mapping    -- dict from window number to document number.
    '''
    # Same patterns as before, written as raw strings so they no longer
    # depend on invalid string escapes.  number_start is deliberately only
    # anchored at the start of the token: the manual label indices below
    # depend on exactly which tokens are treated as numbers.
    number_start = re.compile(r"[+-]?\d*\.\d+|\d+")
    plain_decimal = re.compile(r"^\d+?\.\d+?$")

    def process(document):
        return nltk.word_tokenize(document)

    def genData(words, win, data, centers, mapping, docNum, count):
        # Record one window per number-like token, plus the position of
        # that token inside its (possibly truncated) window.
        for idx, w in enumerate(words):
            if number_start.match(w) is not None:
                start = max(0, idx - win)
                data.append((words[start: min(len(words), idx + win + 1)], 0))
                centers.append(idx - start)
                mapping[count] = docNum
                count += 1
        return count

    def isfloat(s):  # feature
        return plain_decimal.match(s) is not None

    def featurize(val, center, features):
        # `center` is the real position of the number inside `val`.  It was
        # previously assumed to be the middle of the window, which is wrong
        # for windows cut short at the start or end of a document and could
        # wrap around to the last token or raise IndexError.
        arr = [0 for _ in range(len(features))]
        if arr:
            arr[0] = 1  # bias
        token = val[center]
        following = val[center + 1] if center + 1 < len(val) else None
        preceding = val[center - 1] if center > 0 else None
        for idx in range(1, len(arr)):
            if idx == 1:
                if following == '%':
                    arr[idx] = 1
            elif idx == 2:
                if isfloat(token):
                    v = float(token)
                    if v < 0 or v > 100:
                        arr[idx] = int(200)
                    else:
                        # Bucket by tens and square: 0-9.99 -> 1, 10-19.99 -> 4, ...
                        t = (v // 10) + 1
                        arr[idx] = int(t ** 2)
            elif idx == 3:
                if 'discount' in val:
                    arr[idx] = 1
            elif idx == 4:
                if 'rate' in val or 'rates' in val:
                    arr[idx] = 1
            elif idx == 5:
                if 'percentage' in val or 'percentages' in val:
                    arr[idx] = 1
            elif idx == 6:
                if preceding == '$':
                    arr[idx] = 1
        return arr

    # Start of main code
    processed, centers = [], []
    mapping, count = {}, 0  # maps data number to document
    for docNum, document in enumerate(data):
        # Only text can be tokenized; NaN / missing cells are skipped.
        if isinstance(document, str):
            doc = process(document)
            count = genData(doc, 5, processed, centers, mapping, docNum, count)

    # Creates labels for the values
    index = {59, 88, 115, 173, 228, 240, 247, 439, 539, 707,
             769, 970, 1056, 1196, 1198, 1232, 1234, 1293, 1307,
             1367, 1369, 1402, 1404}  # This is manual
    featurized = []
    for idx, (val, _) in enumerate(processed):
        r = featurize(val, centers[idx], features)
        featurized.append((r, 1 if idx in index else 0))
    return featurized, processed, mapping

def testing(data, coeff, offset, mapping, backup):
    def search(word):
        newList = []
        for s in word: 
            if re.match("[+-]?\d*\.\d+|\d+", s) is not None:
                newList.append(s)
        if len(newList) == 0:
            return "No discount rate found!" 
        elif len(newList) == 1:
            return "The discount rate used is " + newList[0] + "."
        else:
            return "The discount rate ranged from " + newList[0] + " to " + newList[1] + "."
    ttlerr = 0
    falsepositive, falsenegative, truepositive, truenegative =0, 0, 0, 0
    for idx, d in enumerate(data): 
        val, y = d
        ttl = sum([a * b for a, b in zip(val, coeff)])
        p = 1.0 / (1.0 + math.exp(-ttl))
        err = (p - y) **2
        ttlerr += (p - y) ** 2
        if y == 1 and p > 0.5: 
            e = idx + offset #entry Num
            print("Entry " + str(e) + " and "+ "document " + str(mapping[e]) + " contains discount rates.")
            print(search(backup[e][0]))
        if y == 1: 
            if p >= 0.5:
                truepositive += 1
            else:
                falsenegative += 1
        if y == 0: 
            if p < 0.5:
                truenegative += 1
            else:
                falsepositive += 1
    return ttlerr, truepositive, truenegative, falsepositive, falsenegative
    
def main():
    '''
    End-to-end run of the window classifier: build the windows from the
    'Text' column, train on the first 600 of them, evaluate on the rest and
    print the five numbers returned by testing().
    '''
    features = [
        'Bias', # range
        'Next character == %?', # boolean 
        'Character = entirely float? If so, within reasonable limits?', # range 
        'Contains the word discount', # boolean 
        'Contains the word rate or rates', # boolean 
        'Contains the word percentage or percentages', # boolean 
        'Contains the $ sign before', # boolean 
    ]
    texts = load()['Text']
    dataset, windows, doc_of_entry = createData(texts, features)

    # Fixed split: the first `split` windows train, everything after tests.
    split = 600
    train_part = dataset[:split]
    test_part = dataset[split:]

    weights = SGD(train_part, 0.005, 1000, len(features))
    e, tp, tn, fp, fn = testing(test_part, weights, split, doc_of_entry, windows)
    print(e, tp, tn, fp, fn)
# Train on the first 600 windows, evaluate on the rest and print the metrics.
main()

Entry 707 and document 14 contains discount rates.
The discount rate ranged from 7.00 to 8.00.
Entry 769 and document 18 contains discount rates.
The discount rate ranged from 8.0 to 10.0.
Entry 970 and document 23 contains discount rates.
The discount rate ranged from 9.0 to 10.0.
Entry 1056 and document 26 contains discount rates.
The discount rate ranged from 6.00 to 7.50.
Entry 1196 and document 28 contains discount rates.
The discount rate ranged from 11.0 to 13.0.
Entry 1198 and document 28 contains discount rates.
The discount rate used is 13.
Entry 1232 and document 29 contains discount rates.
The discount rate ranged from 11.0 to 13.0.
Entry 1234 and document 29 contains discount rates.
The discount rate used is 13.
Entry 1293 and document 30 contains discount rates.
The discount rate ranged from 13.0 to 144.
Entry 1307 and document 30 contains discount rates.
The discount rate used is 13.0.
Entry 1367 and document 32 contains discount rates.
The discount rate ranged from 11.0