In [3]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords,wordnet
from nltk import word_tokenize,pos_tag
import numpy as np
import time
import re
import pickle as pkl
import spacy

# Tagging system

In [44]:
def initialize_variables():
    """Load the IDF table and build every resource the tagging pipeline needs.

    Reads the pickled mapping stored in 'idfs_dict_lemm.pkl' (word -> value,
    presumably inverse document frequencies given the name), and derives the
    vocabulary structures from it.

    Returns:
        A 7-tuple ``(vocab, len_vocab, word_to_int, idfs_array, stopw, lemmer, nlp)``:
        the list of words, its length, a word -> index dict, a float array
        holding each word's pickled value at its index, the NLTK English
        stop-word list, a WordNetLemmatizer and the loaded spaCy
        'en_core_web_sm' pipeline.

    Raises:
        FileNotFoundError: if 'idfs_dict_lemm.pkl' is not in the working directory.
    """
    # NOTE: unpickling can execute arbitrary code; only load a trusted file.
    # The context manager guarantees the handle is closed even if loading fails.
    with open('idfs_dict_lemm.pkl','rb') as idfs_file:
        idfs = pkl.load(idfs_file)
    vocab = list(idfs.keys())
    len_vocab = len(vocab)
    word_to_int = dict(zip(vocab,range(len_vocab)))
    idfs_array = np.zeros(len_vocab)
    for word,index in word_to_int.items(): idfs_array[index] = idfs[word]
    stopw = stopwords.words('english')
    lemmer = WordNetLemmatizer()
    nlp = spacy.load('en_core_web_sm')
    return vocab,len_vocab,word_to_int,idfs_array,stopw,lemmer,nlp

def pt_to_wn(tag):
    """Translate a POS tag into the matching WordNet part-of-speech constant.

    Only the first letter matters: J -> adjective, V -> verb, N -> noun,
    R -> adverb. Any other tag yields None.
    """
    prefix_to_pos = (
        ('J',wordnet.ADJ),
        ('V',wordnet.VERB),
        ('N',wordnet.NOUN),
        ('R',wordnet.ADV),
    )
    for prefix,pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    return None

def remove_unwanted(tags):
    """Return the tags that are worth keeping, without modifying ``tags``.

    For every tag the first ".'s" (possessive marker) is stripped. The tag is
    then discarded when any of its '.'-separated parts contains a day of the
    month ("5th", "10th") or a clock time ("9:30", "10:45pm"), when it is the
    name of a weekday, or when it is at most one character long.

    Args:
        tags: list of tag strings; multi-word tags are joined with '.'.

    Returns:
        A new list with the surviving (possibly possessive-stripped) tags in
        their original order.
    """
    days = {'monday','tuesday','wednesday','thursday','friday','saturday','sunday'}
    # Second digit may be 0 so that 10th/20th/30th are recognised too.
    day_of_the_month_re = re.compile(r'[1-9][0-9]?th')
    time_re = re.compile(r'[0-1]?[0-9]:[0-5][0-9]([ap]m)?')
    belonging_re = re.compile(r"\.'s")
    kept = list()
    for tag in tags:
        # Work on the cleaned tag from here on, so that the value which is
        # tested and the value which is kept/dropped are always the same.
        tag = belonging_re.sub('',tag,count=1)
        bits = tag.split('.')
        if any(day_of_the_month_re.search(bit) or time_re.search(bit) for bit in bits):
            continue
        if tag in days or len(tag)<=1:
            continue
        kept.append(tag)
    return kept

def get_clean_text_raw(raw_text,stopw,lemmer):
    """Tokenise, POS-filter and lemmatise a raw text.

    Only tokens tagged as singular/plural common noun, adjective (base or
    comparative) or any verb form are considered. Of those, stop words,
    non-alphanumeric tokens and pure numbers are dropped.

    Returns:
        ``(lemmas, tags)``: the lower-cased lemmas of the surviving tokens and,
        in the same order, the POS tag each one was given.
    """
    wanted_tags = ('NN','JJ','JJR','NNS','VB','VBD','VBG','VBN','VBP','VBZ')
    lemmas = []
    kept_tags = []
    for token,tag in pos_tag(word_tokenize(raw_text)):
        if tag not in wanted_tags:
            continue
        lowered = token.lower()
        if lowered in stopw or not token.isalnum() or token.isdigit():
            continue
        lemmas.append(lemmer.lemmatize(lowered,pt_to_wn(tag)))
        kept_tags.append(tag)
    return lemmas,kept_tags

def get_TFs_raw(raw_text,len_vocab,word_to_int,stopw,lemmer):
    """Compute the term-frequency vector of a raw text over the vocabulary.

    Returns:
        ``(tfs, unknowns)``: ``tfs`` is an array of length ``len_vocab`` where
        the slot of each vocabulary word holds its share of the cleaned
        tokens; ``unknowns`` lists ``(word, pos_tag)`` for every occurrence of
        a cleaned token that is missing from ``word_to_int`` (repeats kept).
    """
    tokens,tags = get_clean_text_raw(raw_text,stopw,lemmer)
    total = len(tokens)
    occurrences = {}
    for token in tokens:
        occurrences[token] = occurrences.get(token,0)+1
    tfs = np.zeros(len_vocab)
    unknowns = []
    for token,tag in zip(tokens,tags):
        if token in word_to_int:
            tfs[word_to_int[token]] = occurrences[token]/total
        else:
            unknowns.append((token,tag))
    return tfs,unknowns

def get_significants(raw_text,vocab,idfs_array,len_vocab,word_to_int,stopw,lemmer):
    """Rank the whole vocabulary by TF-IDF score for the given text.

    Returns:
        ``(words, scores, unknowns)``: the vocabulary words ordered from the
        highest to the lowest score, the scores in that same order, and the
        out-of-vocabulary ``(word, pos_tag)`` pairs reported by get_TFs_raw.
    """
    term_frequencies,unknowns = get_TFs_raw(raw_text,len_vocab,word_to_int,stopw,lemmer)
    scores = np.multiply(term_frequencies,idfs_array)
    # argsort is ascending, so walk it backwards to get best-first.
    ranking = np.argsort(scores)[::-1]
    ranked_words = np.array(vocab)[ranking]
    ranked_scores = scores[ranking]
    return list(ranked_words),list(ranked_scores),unknowns

def get_sentences(raw_text,punctuation=False):
    """Split a raw text into sentences and tokenise each of them.

    Hyphens are deleted first (so "back-and-forth" becomes "backandforth"),
    then the text is cut at every '.', '!' or '?'. Text that follows the last
    terminator is kept as a final sentence unless it is only whitespace, so
    input that does not end with punctuation is no longer lost.

    Args:
        raw_text: the text to split.
        punctuation: when False (default) only alphanumeric, non-numeric
            tokens are kept; when True every token is kept.

    Returns:
        A list with one list of tokens per sentence. Consecutive terminators
        produce empty token lists.
    """
    raw_text = raw_text.replace('-','')
    pieces = re.split(r'[.!?]',raw_text)
    # re.split always returns, as its last item, whatever follows the final
    # terminator; keep it only when it actually holds text.
    trailing = pieces.pop()
    if trailing.strip(): pieces.append(trailing)
    sentences = list()
    for piece in pieces:
        tokens = list(word_tokenize(piece))
        if not punctuation:
            tokens = [word for word in tokens if word.isalnum() and not word.isdigit()]
        sentences.append(tokens)
    return sentences

def common_words(entity1,entity2):
    """Tell whether two whitespace-separated names share at least one word."""
    first = set(entity1.split())
    second = entity2.split()
    return not first.isdisjoint(second)

def process_ppl_org(entities,len_text):
    """Select the organisation and person names worth turning into tags.

    Every 'ORG' entity is kept as is. 'PERSON' mentions that share a word with
    an already seen name are counted as the same person, remembered under the
    longest variant met so far; only persons mentioned at least as often as
    the average person are kept.

    Args:
        entities: objects exposing ``label_`` and ``text``.
        len_text: not used by this function.

    Returns:
        The kept names (organisations first), each stripped of the words that
        are not purely alphanumeric.
    """
    organisations = []
    mentions = {}
    for ent in entities:
        if ent.label_=='ORG':
            organisations.append(ent.text)
        elif ent.label_=='PERSON':
            name = ent.text
            # First already-known name sharing a word with this mention, if any.
            known = next((seen for seen in mentions if common_words(seen,name)),None)
            if known is None:
                mentions[name] = 1
                continue
            mentions[known] += 1
            if len(name.split())>len(known.split()):
                # The longer variant takes over the accumulated count.
                mentions[name] = mentions.pop(known)
    selected = organisations
    if mentions:
        threshold = np.mean(list(mentions.values()))
        selected.extend(name for name,seen in mentions.items() if seen>=threshold)
    return [' '.join(word for word in name.split() if word.isalnum()) for name in selected]

def process_places(places,len_text,stopw,lemmer):
    """Turn the most frequently mentioned places into dotted, lower-case tags.

    Args:
        places: dict mapping a place name to its number of mentions.
        len_text: not used by this function.
        stopw: collection of stop words removed from multi-word names.
        lemmer: not used by this function.

    Returns:
        One tag per place mentioned at least as often as the average place,
        built by lower-casing its words, dropping stop words and joining the
        rest with '.'. An empty ``places`` gives an empty list.
    """
    # np.mean of an empty list is nan and emits a RuntimeWarning; there is
    # nothing to rank in that case anyway.
    if not places: return []
    average_count = np.mean(list(places.values()))
    return [
        '.'.join([word.lower() for word in place.split() if word.lower() not in stopw])
        for place,counts in places.items()
        if counts>=average_count
    ]

def get_spacy_entities(raw_text,stopw,nlp,lemmer):
    """Extract person, organisation and place tags with the spaCy pipeline.

    The text is processed sentence by sentence. PERSON/ORG entities of at most
    three words go through process_ppl_org, GPE entities are counted and go
    through process_places.

    Returns:
        Person/organisation tags followed by place tags, each lower-cased,
        without stop words and with its words joined by '.'.
    """
    tokenised = get_sentences(raw_text,punctuation=True)
    len_text = sum(len(tokens) for tokens in tokenised)
    candidates = []
    place_counts = {}
    for tokens in tokenised:
        for ent in nlp(' '.join(tokens)).ents:
            label = ent.label_
            if label=='GPE':
                place_counts[ent.text] = place_counts.get(ent.text,0)+1
            elif label in ('PERSON','ORG') and len(ent.text.split())<=3:
                candidates.append(ent)
    names = process_ppl_org(candidates,len_text)
    name_tags = [
        '.'.join(word.lower() for word in name.split() if word.lower() not in stopw)
        for name in names
    ]
    place_tags = process_places(place_counts,len_text,stopw,lemmer)
    return name_tags+place_tags

def process_unknowns(unknowns):
    """Pick one representative word for each repeated out-of-vocabulary stem.

    The ``(word, pos_tag)`` pairs are grouped by Porter stem. A group with a
    single pair is ignored. For the others, the first word whose tag satisfies
    the highest-ranked rule is kept: exactly 'NN', then any noun tag, then
    exactly 'JJ', then any adjective tag, then any verb tag.

    Returns:
        The list of chosen words, one at most per stem.
    """
    stemmer = PorterStemmer()
    by_stem = {}
    for word,tag in unknowns:
        by_stem.setdefault(stemmer.stem(word),[]).append((word,tag))

    # Ordered from most to least preferred.
    preferences = (
        lambda tag: tag=='NN',
        lambda tag: tag.startswith('N'),
        lambda tag: tag=='JJ',
        lambda tag: tag.startswith('J'),
        lambda tag: tag.startswith('V'),
    )
    nothing = object()
    keeps = []
    for group in by_stem.values():
        if len(group)<2:
            continue
        for accepts in preferences:
            chosen = next((word for word,tag in group if accepts(tag)),nothing)
            if chosen is not nothing:
                keeps.append(chosen)
                break
    return keeps

def get_spacy_significants(raw_text,lemmer,vocab,idfs_array,len_vocab,word_to_int,stopw,nlp):
    """Build the final set of tags for a text.

    Combines, in this order, the vocabulary words whose TF-IDF score is at
    least 0.0005 (skipping words already part of an entity tag), the
    representatives of repeated unknown words, and the entity tags; the result
    is cleaned by remove_unwanted and de-duplicated.

    Returns:
        A list of unique tags, in no particular order.
    """
    entities = get_spacy_entities(raw_text,stopw,nlp,lemmer)
    entity_words = [piece for entity in entities for piece in entity.split('.')]
    ranked_words,scores,unknowns = get_significants(raw_text,vocab,idfs_array,len_vocab,word_to_int,stopw,lemmer)
    tags = []
    for candidate,score in zip(ranked_words,scores):
        # Scores are sorted best-first, so everything after this is too low.
        if score<.0005:
            break
        if candidate not in entity_words:
            tags.append(candidate)
    tags.extend(process_unknowns(unknowns))
    tags.extend(entities)
    return list(set(remove_unwanted(tags)))

def get_tags(raw_text):
    """Return the tags for ``raw_text``, loading all resources on every call."""
    resources = initialize_variables()
    vocab,len_vocab,word_to_int,idfs_array,stopw,lemmer,nlp = resources
    return get_spacy_significants(
        raw_text,
        lemmer=lemmer,
        vocab=vocab,
        idfs_array=idfs_array,
        len_vocab=len_vocab,
        word_to_int=word_to_int,
        stopw=stopw,
        nlp=nlp,
    )

# Tests

In [50]:
# Sample article fed to get_tags below as an end-to-end check of the pipeline.
text = """
Is it happening? Is it not happening? After years of back and forth, it looks like the new tax on tech giants in France is about to become a law. Big tech companies that generate significant revenue in France will be taxed on their revenue generated in France.

France’s Economy Minister Bruno Le Maire has been lobbying for a new tax so that tech giants would stop optimizing their European corporate structure to lower their effective tax rate. Originally, Le Maire wanted to convince other European countries to get on board.

But you need a unanimous vote when it comes to tax reforms in Europe. And Le Maire couldn’t convince everyone.

Le Maire still wanted to do something. So here we are, with a new tax on tech companies that generate over €750 million in revenue globally and €25 million in France.

If you’re operating a marketplace (Amazon’s marketplace, Uber, Airbnb…) or an advertising business (Facebook, Google, Criteo…), you will have to pay 3 percent of your French revenue in taxes. The government says that it isn’t against American companies as European and Asian companies are also about to get taxed.

It’s a weird taxation model as it is based on revenue and not profit. It’ll also require some work from the taxation administration as French revenue means that it involves all transactions with somebody with a French mailing address or a French IP address. France expects to generate €400 million in revenue with this new tax in 2019.

"""
# Time one full get_tags call; the measurement includes resource loading,
# because get_tags calls initialize_variables itself.
start = time.time()
tags = get_tags(text)
stop = time.time()
# Print every tag on its own line prefixed with '#', then the elapsed seconds.
print('#'+'\n#'.join(tags))
print('\ntime:',stop-start)

#facebook
#google
#france
#bruno.le.maire
#optimize
#amazon
#french
#le.maire
#unanimous
#marketplace
#taxation
#american
#weird
#mailing
#european
#uber
#tech

time: 0.8228862285614014
