In [None]:
import pandas as pd
import numpy as np
import re
import math
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
import gensim
from nltk.tokenize import word_tokenize
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaMulticore
from pprint import pprint

import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Basically Get the Same Data and Process It Like Roger and Crystal

## Google News

In [None]:
# Load the manually grouped keyword list and preview its first rows.
keywords_all = pd.read_csv('Keywords - Manual Group.csv')
keywords_all.head()

In [None]:
# Keep only the 'Category' and 'Keywords' columns for rows whose category is one of the three listed.
keywords = keywords_all.loc[keywords_all['Category'].isin(['Rule', 'Government', 'violation']), ['Category', 'Keywords']]

### Read the News Data Frame

In [None]:
# Non-compliance articles: 'text' is the title and summary joined by '. '.
# NOTE(review): a NaN in either column makes the concatenated 'text' NaN as well — confirm how missing values should be handled.
google_news_finance = pd.read_excel('google_news_finance_1021.xlsx')
google_news_finance['text'] = google_news_finance['title'] + '. ' + google_news_finance['summary']

# Compliance articles (Google News), built the same way.
google_news = pd.read_csv('google_news_1021.csv')
google_news['text'] = google_news['title'] + '. ' + google_news['summary']

# Second article table; note its columns are capitalised ('Title', 'Summary').
df = pd.read_excel('NewsPaper_df.xlsx')
df['text'] = df['Title'] + '. ' + df['Summary']

In [None]:
# Only the last expression of a cell is rendered, so the keyword list below is
# computed but not shown; the displayed array is df.Source.unique().
google_news_finance.keyword.unique()
df.Source.unique()

array(['SEC', 'WSJ', 'The FCPA Blog', 'Corporate Compliance Insights',
       'Compliance and Enforcement', 'Financial Services Perspectives'],
      dtype=object)

### Preprocessing

In [None]:
# Shared NLTK resources used by normalize_corpus below.
stop_words = nltk.corpus.stopwords.words('english')  # English stop-word list
wtk = nltk.tokenize.RegexpTokenizer(r'\w+')  # tokens are runs of word characters
wnl = nltk.stem.wordnet.WordNetLemmatizer()

def normalize_corpus(paper):
  """Turn one document into a list of cleaned, lemmatized tokens.

  The text is lower-cased and split with the module-level ``wtk`` tokenizer.
  Purely numeric tokens are discarded, the rest are lemmatized with ``wnl``,
  and lemmas that are a single character long or appear in ``stop_words``
  are dropped.
  """
  cleaned = []
  for raw_token in wtk.tokenize(paper.lower()):
    token = raw_token.strip()
    if token.isnumeric():
      continue
    lemma = wnl.lemmatize(token)
    # Length and stop-word checks apply to the lemma, not the raw token.
    if len(lemma) <= 1 or lemma in stop_words:
      continue
    cleaned.append(lemma)
  return cleaned

In [None]:
# Stack the 'text' columns of both compliance sources into one Series and drop missing entries.
compliance_news = pd.concat([google_news['text'], df['text']], ignore_index=True).dropna()
# The finance news text is used as-is (no dropna here).
non_compliance_news = google_news_finance['text']

# Simple TF-IDF on Compliance News without Identifying Compliance vs Non-Compliance

### Creating Vocabulary and Word Counts to conduct TFIDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

We tokenize and lemmatize the text with NLTK (the `normalize_corpus` function above). After unifying the word forms, we apply scikit-learn's TF-IDF vectorizer to project the text onto word vectors, so the whole corpus is converted into a matrix with each content word (or n-gram of up to three words) as a feature and its TF-IDF score as the value.

In [None]:
# Applying TFIDF
# normalize_corpus already returns a token list, so it is plugged in as the
# "preprocessor" and the tokenizer is an identity function; lowercase=False
# because normalize_corpus lower-cases the text itself.
tfidf_vectors = TfidfVectorizer(tokenizer=lambda i: i, ngram_range=(1, 3),
                                preprocessor=normalize_corpus,
                                max_df=0.85, lowercase=False)
tfidf = tfidf_vectors.fit_transform(compliance_news)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on old installations.
if hasattr(tfidf_vectors, 'get_feature_names_out'):
    features = list(tfidf_vectors.get_feature_names_out())
else:
    features = tfidf_vectors.get_feature_names()

In [None]:
def sort_coo(coo_matrix):
    """Return the (column, value) pairs of a COO matrix, largest value first.

    Entries with equal values are ordered by descending column index.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the first ``topn`` (index, score) pairs to ``{feature: score}``.

    ``sorted_items`` is a sequence of (feature index, score) pairs; each index
    is looked up in ``feature_names`` and each score is rounded to three
    decimals. If the same feature occurs more than once, the later score
    replaces the earlier one.
    """
    return {
        feature_names[col]: round(value, 3)
        for col, value in sorted_items[:topn]
    }

In [None]:
# Rank every stored TF-IDF entry of the matrix and keep the 50 top-ranked entries as {term: score}.
# NOTE(review): this rebinds `keywords`, which an earlier cell assigned from keywords_all — confirm the earlier value is no longer needed.
sorted_items=sort_coo(tfidf.tocoo())
keywords=extract_topn_from_vector(features,sorted_items,50)

### Sort the keyword and pull result out

In [None]:
# Order the selected terms by ascending score (highest score last) and display them.
sorted_keywords = sorted(keywords.items(), key=lambda kv: kv[1])
import collections
collections.OrderedDict(sorted_keywords)

OrderedDict([('bankruptcy', 0.392),
             ('udf', 0.393),
             ('chicken', 0.395),
             ('fund', 0.396),
             ('voter', 0.396),
             ('disparate impact', 0.397),
             ('disparate', 0.397),
             ('tender', 0.398),
             ('canada', 0.405),
             ('mcafee', 0.408),
             ('compliance date', 0.409),
             ('ransomware', 0.409),
             ('fair value', 0.41),
             ('retail sale', 0.41),
             ('delaware', 0.414),
             ('fair canada', 0.414),
             ('tier', 0.415),
             ('circuit breaker', 0.416),
             ('breaker', 0.416),
             ('ethereum', 0.417),
             ('commerzbank', 0.42),
             ('etf', 0.423),
             ('sbsds', 0.424),
             ('vendor', 0.426),
             ('campaign', 0.426),
             ('rating', 0.428),
             ('rbi', 0.429),
             ('13f', 0.434),
             ('midstream', 0.437),
             ('madden', 

# Named Entity Recognition. The Stanford NER jar file can be downloaded here: https://stanfordnlp.github.io/CoreNLP/index.html#download

NER covers the names of people and organizations, as well as organization addresses and zip codes. Entity recognition identifies entities with a specific meaning in text, which fall into entity classes (person name, place name, organization name), time classes (date) and digit classes (phone number, zip code).

In [None]:
#!  pip install polyglot
#! pip install PyICU
#! pip install pycld2
#! pip install morfessor
! polyglot download embeddings2.en
! polyglot download ner2.en
from polyglot.text import Text
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize
import spacy
import re
import os

### Preprocessing

In [None]:
def preprocess(doc):
    """Strip possessives, '.com' and a few punctuation marks from ``doc``.

    Case is left untouched. Fragments are removed one after another in the
    order listed, so "'s" is stripped before the bare apostrophe.
    """
    for fragment in ("\'s", '.com', ':', "\'", '\"', '(', ')', '-'):
        doc = doc.replace(fragment, '')
    return doc

def filter_numbers(text):
    """Return ``text`` with every run of digits removed."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

There are three methods of entity recognition:<br/>
1.   Linguistic grammar-based techniques: these rely mainly on grammar rules. In engineering practice this means writing many regular expressions, which works well for recognizing time-class and digit-class named entities.
2.   Statistical models: the main statistical methods today are HMM and CRF models, both of which are relatively mature.

3.   Deep learning models: deep learning is currently the most popular approach, especially RNN-family models, which can absorb more of the text's semantic information and currently give the best results.

In [None]:
def stanford_parse(text):
    """Extract named entities from ``text`` with the Stanford NER tagger.

    The text is tokenized with ``word_tokenize`` and tagged using the 3-class
    CRF model and jar located under ``./stanford-ner`` relative to the current
    working directory. Consecutive tokens carrying the same entity tag are
    joined with single spaces into one entity, which is lower-cased and has
    its digits removed by ``filter_numbers``.

    Returns:
        dict with keys 'PERSON', 'LOCATION' and 'ORGANIZATION', each mapping
        to a set of entity strings.
    """
    from itertools import groupby

    parent_dir = os.path.abspath('')
    st = StanfordNERTagger(parent_dir + '/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                           parent_dir + '/stanford-ner/stanford-ner-3.9.2.jar', encoding='utf-8')
    classified_text = st.tag(word_tokenize(text))

    ner_dict = {'PERSON': set(), 'LOCATION': set(), 'ORGANIZATION': set()}

    # groupby yields each maximal run of identically tagged tokens, so every
    # entity is stored in full. The previous hand-rolled state machine never
    # stored a multi-token entity that ended the text, and lost the first
    # token of an entity that directly followed a differently tagged entity.
    for tag, run in groupby(classified_text, key=lambda pair: pair[1]):
        if tag not in ner_dict:
            # 'O' (no entity) or any tag outside the three tracked classes.
            continue
        entity = ' '.join(token for token, _ in run)
        ner_dict[tag].add(filter_numbers(entity.lower()))

    return ner_dict


def polyglot_parse(text):
    """Extract named entities from ``text`` with polyglot.

    Each polyglot entity chunk is joined into a space-separated string,
    lower-cased, stripped of digits with ``filter_numbers`` and filed under
    the label its tag maps to.

    Returns:
        dict with keys 'PERSON', 'LOCATION' and 'ORGANIZATION', each mapping
        to a set of entity strings.
    """
    chunks = Text(text).entities
    tag_to_label = {'I-PER': 'PERSON', 'I-LOC': 'LOCATION', 'I-ORG': 'ORGANIZATION'}
    ner_dict = {'PERSON': set(), 'LOCATION': set(), 'ORGANIZATION': set()}

    for chunk in chunks:
        joined = ' '.join(chunk)
        label = tag_to_label[chunk.tag]
        ner_dict[label].add(filter_numbers(joined.lower()))

    return ner_dict


_SPACY_NLP = None  # spaCy pipeline, loaded on first use and then reused


def _get_spacy_nlp():
    """Return the English spaCy pipeline, loading it only once."""
    global _SPACY_NLP
    if _SPACY_NLP is None:
        try:
            # Keep using the 'en' shortcut where it still exists (spaCy 2).
            _SPACY_NLP = spacy.load('en')
        except OSError:
            # spaCy 3 removed shortcut links; use the full package name.
            _SPACY_NLP = spacy.load('en_core_web_sm')
    return _SPACY_NLP


def spacy_parse(text):
    """Extract named entities from ``text`` with spaCy.

    Only entities labelled PERSON, ORG or GPE are kept; they are filed under
    'PERSON', 'ORGANIZATION' and 'LOCATION' respectively, lower-cased and
    stripped of digits with ``filter_numbers``.

    The pipeline is loaded once and cached, because reloading the model on
    every call dominates the runtime when this is applied to many documents.

    Returns:
        dict with keys 'PERSON', 'ORGANIZATION' and 'LOCATION', each mapping
        to a set of entity strings.
    """
    sp = _get_spacy_nlp()(text)
    m = {'PERSON': 'PERSON', 'ORG': 'ORGANIZATION', 'GPE': 'LOCATION'}
    ner_dict = {'PERSON': set(), 'ORGANIZATION': set(), 'LOCATION': set()}
    for ent in sp.ents:
        if ent.label_ in m:
            ner_dict[m[ent.label_]].add(filter_numbers(ent.text.lower()))

    return ner_dict


def symm_agree(w1, w2):
    """Return True when either string is a substring of the other."""
    if w1 in w2:
        return True
    return w2 in w1


def get_smallest(w1, w2, w3):
    """Return the string contained in both of the others.

    ``w1`` is checked first, then ``w2``; if neither is a substring of the
    other two, ``w3`` is returned without being checked.
    """
    for candidate, others in ((w1, (w2, w3)), (w2, (w1, w3))):
        if all(candidate in other for other in others):
            return candidate
    return w3


# Never selects the third ranked
def get_biased_smallest(w1, w2, w3):
    """Pick between ``w1`` and ``w2`` only; ``w3`` is never returned.

    ``w1`` wins when it is contained in ``w3`` and the first two strings
    overlap (one contains the other). Otherwise ``w2`` wins when it is
    contained in both other strings. Failing both, ``w1`` is returned if it
    is a substring of ``w2``, else ``w2``.
    """
    if w1 in w3 and (w1 in w2 or w2 in w1):
        return w1
    if w2 in w1 and w2 in w3:
        return w2
    return w1 if w1 in w2 else w2

def get_named_entities(Doc):
    """Run the three NER taggers on ``Doc`` and combine their results.

    ``Doc`` is cleaned with ``preprocess`` and tagged by polyglot, Stanford
    NER and spaCy; ``vote`` then reconciles the three results per entity type.

    Post-processing:
      * persons: a four-word entry is split into two two-word names, and
        single-word entries are discarded;
      * organizations: entries of five or more words are discarded.

    Returns:
        A 3-tuple ``(persons, locations, organizations)`` of lists of
        title-cased strings. All three lists are empty when ``Doc`` is not a
        string (e.g. NaN from a missing title/summary) or when any tagger
        raises.
    """
    if not isinstance(Doc, str):
        return [], [], []

    doc = preprocess(Doc)
    try:
        a = polyglot_parse(doc)
        b = stanford_parse(doc)
        c = spacy_parse(doc)
    except Exception:
        # Best effort: a tagger failure yields no entities for this document.
        # Returning three lists keeps the shape identical to the success path
        # (the old bare `return []` broke callers that unpack three values),
        # and `Exception` no longer swallows KeyboardInterrupt.
        return [], [], []

    persons = set()
    for p in vote(a, b, c, 'PERSON'):
        words = p.split(' ')
        if len(words) == 4:
            # Presumably two full names fused together: split down the middle.
            persons.update([' '.join(words[:2]), ' '.join(words[2:])])
        elif len(words) != 1:
            persons.add(p)

    locations = vote(a, b, c, 'LOCATION')
    orgs = {n for n in vote(a, b, c, 'ORGANIZATION') if len(n.split(' ')) < 5}

    return ([p.title() for p in persons],
            [l.title() for l in locations],
            [o.title() for o in orgs])

Here we adopted three NER models: **Stanford NER, polyglot and spacy entities.** <br/>
StanfordNER is a java implementation of NER (named entity recognizer), it can mark the sequence of words in the text, such as person name, company name, gene name or protein name. It comes with a well-designed feature extractor for NER and many options for defining the feature extractor. There are many good English named entity recognizers, especially for person name, organization name, place name(Locations).<br/>
Polyglot language detection relies on pycld2 and cld2; cld2 is a multilingual detection library developed by Google. The training corpus for polyglot's entity recognition comes from Wikipedia. The trained models are not bundled with the package, so they must be downloaded before first use. Polyglot supports the identification of entity classes (person name, place name, organization name) in 40 languages.<br/>
Spacy includes a fast entity recognition model – “spacy entities”, which can recognize entity phrases in documents. There are many types of entities, such as
people, places, organizations, dates, numbers. You can access these entities through the ents property of the document.<br/>
**We created a voting function, vote(), that picks the most appropriate NER result from these three algorithms. After the person names are recognized, more than one person's name is sometimes fused together in a single "Person Name" entry. We then split such entries by name length (a four-word entry is split into two two-word names).**

In [None]:
def vote(poly, stanf, spac, ner_type):
    """Reconcile the entities of type ``ner_type`` found by the three taggers.

    Every (polyglot, Stanford, spaCy) combination of entities is compared
    with ``symm_agree``. When all three pairs agree, ``get_biased_smallest``
    chooses the winner; when only some pairs agree, one entity of an agreeing
    pair is kept. Because combinations are formed across all three sets, the
    result is empty whenever any tagger found nothing for ``ner_type``.

    Afterwards, entities that are substrings of another selected entity and
    entities that exactly match a generic blacklisted word are removed.

    Returns:
        set of selected entity strings.
    """
    candidates = set()

    for poly_ent in poly[ner_type]:
        for stanf_ent in stanf[ner_type]:
            for spac_ent in spac[ner_type]:
                poly_stanf = symm_agree(poly_ent, stanf_ent)
                stanf_spac = symm_agree(stanf_ent, spac_ent)
                poly_spac = symm_agree(poly_ent, spac_ent)

                if not (poly_stanf or stanf_spac or poly_spac):
                    # No pair overlaps: this combination contributes nothing.
                    continue

                if poly_stanf and stanf_spac and poly_spac:
                    chosen = get_biased_smallest(poly_ent, stanf_ent, spac_ent)
                elif poly_spac:
                    chosen = poly_ent
                elif poly_stanf:
                    chosen = poly_ent if poly_ent in stanf_ent else stanf_ent
                else:
                    chosen = stanf_ent if stanf_ent in spac_ent else spac_ent
                candidates.add(chosen)

    # Finally, drop blacklisted words and anything contained in a longer candidate.
    blacklist = ['foundation', 'company', 'inc', 'corp', 'business', 'l.l.c', 'corporation', 'incorporated']
    return {
        ent for ent in candidates
        if ent not in blacklist
        and not any(ent != other and ent in other for other in candidates)
    }


### Ignore Warnings

In [None]:
import warnings
# warnings.warn("ignore") only emitted a warning whose message was "ignore";
# filterwarnings("ignore") is the call that actually suppresses warnings.
warnings.filterwarnings("ignore")

# One list entry per article, in the row order of df['text'].
People = []
Location = []
Orgnization = []
for text in df['text']:
  # `or ([], [], [])` tolerates a bare [] returned by get_named_entities on
  # failure, which would otherwise break the three-way unpack.
  people, location, orgs = get_named_entities(text) or ([], [], [])
  People.append(people)
  Location.append(location)
  Orgnization.append(orgs)

In [None]:
# Builds a frame with three rows (People, Location, Orgnization) and one column per article.
NER = pd.DataFrame([People,Location,Orgnization])

[['Firm', 'Releasethe Securities', 'Securities And Exchange Commission'],
 ['Releasethe Securities', 'Exchange Commission', 'Finra'],
 ['U.S . Securities', 'Commodity Futures Trading Commission', 'Cftc'],
 ['Marathon', 'Sec'],
 ['Pricewaterhousecoopers',
  'Pcaob Board',
  'Releasethe Securities',
  'Securities And Exchange Commission',
  'Technology'],
 ['Pcaob Board',
  'Releasethe Securities',
  'Securities And Exchange Commission',
  'Public Company'],
 ['Releasethe Securities', 'Exchange Commission'],
 ['Jbs', 'Sec', 'Pilgrim’S Pride Corporation', 'Ministerio Publico Federal'],
 ['Releasethe Securities',
  'Securities And Exchange Commission',
  'Disclosure Review Program',
  'Division Of Corporation Finance'],
 ['Division',
  'Disclosure Review Program',
  'Office Of Municipal Securities',
  'Nrsros',
  'Releasethe Securities',
  'Securities And Exchange Commission',
  'Office Of Credit Ratings'],
 ['Sae', 'Sec'],
 ['Timmons',
  'Releasethe Securities',
  'New',
  'Securities And