## Create vectors for documents using MITIE NER and feature hashing

In [None]:
from openpyxl import Workbook, load_workbook
import json

# Open the spreadsheet and pick the sheet that lists the documents.
wb = load_workbook(filename='GoogleNews_18July.xlsx')
sheet_list1 = wb['DocumentOrder']

# Read cells A1..A172, dropping any character that is not plain ASCII.
names = [
    sheet_list1['A%d' % row].value.encode(encoding='ascii', errors='ignore')
    for row in range(1, 173)
]
print(names)

## Prepare document content: strip punctuation and tokenize

In [2]:
import os
import shutil
import nltk
from nltk.tokenize import TreebankWordTokenizer

# Parallel accumulators: doc_content[i] is the cleaned text of the document
# whose name (file name minus its last four characters) is names[i].
doc_content = []
names = []

# Corpus directories, in processing order. Order matters: the reader loop
# treats the first two entries differently from the rest.
paths = [
    "/home/malaviyac/lda2vec/testdata/NewSet/",
    "/home/malaviyac/lda2vec/testdata/FinanceSet/",
    "/home/malaviyac/lda2vec/testdata/Education/",
    "/home/malaviyac/lda2vec/testdata/SEER/",
    "/home/malaviyac/lda2vec/testdata/Duplicates/",
]

# Characters removed from every document. The last entry, '^L', is two
# characters long, so it can never match in a per-character test; clean()
# handles it as a whole token instead.
bad = ["“", "”","'","/",'\\','"',':','^','-','<','>',"&",'$','#','=','*','^L']
# Token list of the most recently tokenized text.
tokens = []

def clean(line):
    """Return *line* with '^L' marker tokens and unwanted characters removed.

    The line is split on whitespace. Tokens that are exactly '^L' are
    dropped first; doing this after the character pass (as before) could
    never match, because '^' had already been stripped and the marker
    survived as a stray 'L'. Every character listed in ``bad`` is then
    removed from the remaining tokens, tokens left empty are discarded,
    and the rest are joined with single spaces.
    """
    kept = []
    for word in line.split():
        if word == '^L':
            continue
        word = ''.join(ch for ch in word if ch not in bad)
        if word:
            kept.append(word)
    return ' '.join(kept)
    

def _file_number(name):
    # Integer found between the 7th character and the first "." of the name.
    return int(name[7:name.find(".")])


# A single tokenizer instance is reused for every file.
tokenizer = TreebankWordTokenizer()

for position, path in enumerate(paths):
    entries = os.listdir(path)
    if position < 2:
        # The first two directories are ordered by the number in the file name.
        entries.sort(key=_file_number)
    else:
        entries.sort()

    for filename in entries:
        with open(os.path.join(path, filename), 'r') as filedata:
            tokens = tokenizer.tokenize(filedata.read())
        # Re-join the tokens with spaces, flatten newlines, drop non-ASCII.
        text = " ".join(tokens).replace("\n", " ")
        text = text.decode(encoding='ascii', errors='ignore')
        doc_content.append(clean(text))
        # Document name is the file name without its last four characters.
        names.append(filename[:-4])

# print doc_content



## Create feature hashed vectors for top 5 entities in a document obtained from MITIE

In [None]:
import requests
import json
import hashlib
import operator

# doc_content = ["Frank Cuban is my name. What 'Frank Cuban' is this witchcraft David? Cheetah is my favorite animal. Hello, how are you! I have been thinking.","Mark is a great industrialist. Apple is a bad company."]
MITIE_URL = 'http://fastner.sage:3020/entities/mitie'
VECTOR_SIZE = 100   # number of hash buckets in each document vector
TOP_ENTITIES = 5    # most frequent entities kept per document

doc_matrix = []     # one hashed vector per document, in doc_content order
d = {}              # document name -> list of its top entities

for j, doc in enumerate(doc_content):
    # Serialise with json.dumps so quotes, backslashes and control characters
    # in the document are escaped instead of corrupting the request body.
    payload = json.dumps({"sentences": [doc]})
    r = requests.post(MITIE_URL, data=payload)
    # Fail here on an HTTP error rather than on a confusing JSON/KeyError below.
    r.raise_for_status()
    entities = r.json()['entities'][0]
    # NOTE(review): 'r' is used as a [start, end) pair of token offsets that
    # is assumed to line up with doc.split() -- confirm against the service.
    entity_indices = [found['r'] for found in entities]

    # Count how often each entity string occurs in this document.
    words = doc.split()
    entitylist = {}
    for index in entity_indices:
        entity = " ".join(words[index[0]:index[1]])
        entitylist[entity] = entitylist.get(entity, 0) + 1

    sortedEntityList = sorted(entitylist.items(), key=operator.itemgetter(1), reverse=True)

    # Feature-hash the most frequent entities: each one increments the bucket
    # selected by its MD5 digest modulo the vector size.
    ent = []
    doc_vec = [0] * VECTOR_SIZE
    for entity, _count in sortedEntityList[:TOP_ENTITIES]:
        ent.append(str(entity))
        hashed_entity = int(hashlib.md5(str(entity)).hexdigest(), 16) % VECTOR_SIZE
        doc_vec[hashed_entity] += 1

    d[names[j]] = ent
    doc_matrix.append(doc_vec)

print(json.dumps(dict(entities=d)))

In [None]:
import csv
# Persist the hashed document vectors, one vector per CSV row.
with open('hashed_docs.csv', 'wb') as f:
    csv.writer(f).writerows(doc_matrix)
    

### Testing Treebank Tokenizer

In [None]:
# import nltk
# from nltk.tokenize import TreebankWordTokenizer
# tokens = []
# tokens = TreebankWordTokenizer().tokenize("Frank Cuban is my name. What 'Frank Cuban' is this witchcraft David? Cheetah is my favorite animal. David, how are you! I have been thinking.")
# print tokens
# print(" ".join(tokens))

### Experimenting with TF-IDF vectors for the documents (implementation unfinished)

In [None]:
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

# Skip English stop words and terms whose document frequency exceeds 95%.
tfidf = TfidfVectorizer(max_df=0.95,stop_words='english')
# fit_transform learns the vocabulary and IDF weights and returns the weighted
# document-term matrix in one pass. Previously its result was discarded and
# the corpus was vectorised a second time via transform(..., copy=True),
# whose `copy` argument newer scikit-learn releases no longer accept.
tfidf_doc2topic = tfidf.fit_transform(doc_content)

In [None]:
print (tfidf_doc2topic.shape)
import numpy as np
# tfidf_matrix is simply another name for the vectoriser output. The dense
# np.zeros((n_docs, n_terms)) array that used to be allocated here was
# overwritten on the very next line and only cost memory.
tfidf_matrix = tfidf_doc2topic
# Show the first document's weights as a dense array.
print (tfidf_matrix[0].toarray())

In [None]:
import csv
# Write one CSV row per document with one column per vocabulary term.
with open('tfidf_docs.csv', 'wb') as f:
    writer = csv.writer(f)
    for i in range(tfidf_matrix.shape[0]):
        # toarray() on a single row is 2-D with shape (1, n_terms); index [0]
        # to get the flat vector of weights. Writing the 2-D array directly
        # produced a single cell holding the array's repr, and the old
        # `orig = [[]]` seed added an empty first row to the file.
        writer.writerow(tfidf_matrix[i].toarray()[0])
    

## Lexis Nexis Work beyond this point

In [None]:
import requests
import json
import hashlib
import operator
from json import dumps
from collections import OrderedDict
from xml.etree.ElementTree import fromstring
from xmljson import BadgerFish
# BadgerFish converter used to turn the XML article feed into JSON-able dicts.
bf = BadgerFish(dict_type=OrderedDict)

# NOTE(review): the API key and Basic-auth header below are credentials
# committed in source. They should be rotated and loaded from configuration
# or the environment instead of living in the notebook.
api_key = '0fc921fa3a1f4653988216cdf9483534'
headers = {'Authorization': 'Basic SGFuZHNoYWtlczpzb3VyY2VzNDE4Mg==',
           'content-type':'application/json',
           'Accept':'application/json'}

# Register two rules with the portal and echo the service's reply.
payload = {'key': api_key}
data = [{"name": "query_us_uk", "query": "sourceCountryCode:us OR sourceCountryCode:can", "active": "true"},
{"name": "query_gr","query": "sourceCountryCode:can","active": "false"}]
rule = requests.post("https://portal.moreover.com/portal-rest/v1/rules/save",  json=data, params = payload, headers = headers)
# Parenthesised print works on both Python 2 and 3 and matches the other cells.
print(rule.text)

# Fetch every stored rule, then request the articles for each rule id.
rules_list = requests.get("https://portal.moreover.com/portal-rest/v1/rules/all", params=payload, headers = headers)
json_response = rules_list.json()

for query in json_response["queries"]:
    sequenceid = query["id"]
    payload = {'key': api_key, 'sequenceid': sequenceid}
    # NOTE(review): this endpoint is plain http, so the API key travels
    # unencrypted -- confirm whether an https endpoint is available.
    article = requests.get("http://metabase.moreover.com/api/v10/articles", params = payload, headers = headers)
    # The response body is parsed as XML and re-serialised as JSON text.
    json_article = dumps(bf.data(fromstring(article.text)))
    print(json_article)
    
