In [None]:
import stanza
import pandas as pd
from ast import literal_eval

In [None]:
# English Stanza pipeline restricted to the tokenize and lemma processors.
nlp = stanza.Pipeline(
    lang="en",
    processors="tokenize, lemma",
)



2021-06-23 21:51:07 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| lemma     | combined |

2021-06-23 21:51:07 INFO: Use device: cpu
2021-06-23 21:51:07 INFO: Loading: tokenize
2021-06-23 21:51:07 INFO: Loading: lemma
2021-06-23 21:51:07 INFO: Done loading processors!


In [None]:
# Read the workbook, discard rows containing NaN (about 2500 of them),
# and renumber the remaining rows from 0.
text_2019 = (
    pd.read_excel("10K_2019.xlsx", index_col=0)
    .dropna()
    .reset_index(drop=True)
)

In [None]:
# Show the loaded DataFrame as this cell's output.
text_2019

Unnamed: 0,path,para_keywords
0,D:\10-K\10-K\19\0000001800\0001047469-19-00062...,"2013\nto 2015 — President, Early Technologies,..."
1,D:\10-K\10-K\19\0000002186\0001654954-19-00203...,The\noccurrence of one or more natural disaste...
2,D:\10-K\10-K\19\0000002969\0000002969-19-00005...,Our operations could be impacted by catastroph...
3,D:\10-K\10-K\19\0000004281\0000004281-19-00003...,kidnapping of personnel; major public health i...
4,D:\10-K\10-K\19\0000006281\0000006281-19-00014...,•natural disasters or pandemics;\n------------...
...,...,...
617,D:\10-K\10-K\19\0001742927\0001144204-19-02777...,"The occurrence of natural disasters, including..."
618,D:\10-K\10-K\19\0001749704\0001558370-19-00169...,“patient census”) and the services those patie...
619,D:\10-K\10-K\19\0001750019\0001750019-19-00000...,"Our operations, and those of our CROs, CMOs an..."
620,D:\10-K\10-K\19\0001758488\0001193125-19-13744...,Other unscheduled withdrawals of ships from se...


In [None]:
# Length threshold read by get_all_sentences: a sentence is kept only when
# it has more than this many words.
at_least_num_words = 10


def sentences_tokenize(paragraph, nlp):
    """Apply `nlp` to `paragraph` and return the result's `sentences` attribute."""
    return nlp(paragraph).sentences

def preprocess_paragraph(p):
    """Return `p` with every newline character replaced by a single space."""
    return " ".join(p.split("\n"))

# Split the original text into a list of paragraphs.
def get_paragraphs_list(ps):
    """Split a raw text cell into its non-blank paragraphs.

    Newlines are first replaced by spaces via `preprocess_paragraph`; the
    result is then split on the dash separator.

    Parameters
    ----------
    ps : str
        Raw text in which paragraphs are delimited by the dash separator.

    Returns
    -------
    list of str
        The paragraph chunks in their original order, excluding chunks that
        are empty or contain only whitespace.
    """
    chunks = preprocess_paragraph(ps).split("---------------------------")
    # After preprocessing no chunk can equal "\n" (newlines are already
    # spaces), so blank chunks must be detected by stripping whitespace.
    return [chunk for chunk in chunks if chunk.strip()]

# Tokenize every paragraph into sentences (with lemmas) and keep the long ones.
def get_all_sentences(paras, nlp=None):
    """Collect the sufficiently long sentences found in `paras`.

    Parameters
    ----------
    paras : sized iterable of str
        Raw text cells; each one is split with `get_paragraphs_list`.
    nlp : callable, optional
        Pipeline applied to each paragraph. When omitted, a new
        `stanza.Pipeline(lang='en', processors='tokenize, lemma')` is built
        (the previous behaviour). Pass an already-loaded pipeline to avoid
        loading the models a second time.

    Returns
    -------
    list
        The sentence objects whose word count is strictly greater than the
        module-level `at_least_num_words`.
    """
    if nlp is None:
        nlp = stanza.Pipeline(lang='en', processors='tokenize, lemma')

    total = len(paras)  # len() works for lists as well as pandas Series
    sentences = []

    for idx, ps in enumerate(paras):
        # Fixed-width field: after "\r" a shorter value must fully overwrite
        # the previous one, otherwise stale digits remain (e.g. "100.0%2%").
        print(f"\r{(idx + 1) / total * 100:8.4f}%", end="", flush=True)
        for p in get_paragraphs_list(ps):
            sentences.extend(
                s for s in sentences_tokenize(p, nlp)
                if len(s.words) > at_least_num_words
            )

    return sentences


In [None]:
# Sentence objects extracted from the "para_keywords" column.
sentences = get_all_sentences(paras=text_2019["para_keywords"])

2021-06-23 22:30:46 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| lemma     | combined |

2021-06-23 22:30:46 INFO: Use device: cpu
2021-06-23 22:30:46 INFO: Loading: tokenize
2021-06-23 22:30:46 INFO: Loading: lemma
2021-06-23 22:30:46 INFO: Done loading processors!


100.0%2%

In [None]:
# One list of lemmas per kept sentence, in the sentence's word order.
lemma_sentences = [
    [word.lemma for word in sentence.words]
    for sentence in sentences
]

# Train Word2Vec model

In [None]:
from gensim.models import Word2Vec
import numpy as np

In [None]:
# Train a 100-dimensional Word2Vec model on the lemmatized sentences.
model = Word2Vec(
    lemma_sentences,
    min_count=1,
    vector_size=100,
    window=5,
)
def transform_sentences_to_vector(sentences, model):
    """Embed each sentence as the average of its word vectors.

    Parameters
    ----------
    sentences : iterable of sequence
        Tokenized sentences; every token is looked up in `model.wv`.
    model : object
        Embedding model exposing `vector_size` and a `wv` mapping from
        token to vector.

    Returns
    -------
    list of numpy.ndarray
        One vector of length `model.vector_size` per input sentence, in
        input order. An empty sentence yields the zero vector.
    """
    vector_size = model.vector_size
    vectors = []

    for s in sentences:
        v = np.zeros(vector_size)
        for w in s:
            v += model.wv[w]
        # Guard the division: for an empty sentence 0 / 0 would produce an
        # all-NaN vector (plus a RuntimeWarning) instead of a usable value.
        vectors.append(v / len(s) if len(s) else v)

    return vectors

# One averaged word-vector per lemmatized sentence.
data = transform_sentences_to_vector(sentences=lemma_sentences, model=model)

# K-means

In [None]:
from nltk.cluster import KMeansClusterer, cosine_distance

def group_sentence_by_kmean(lemma_sentences, data, num_clusters=8, repeats=5, seed=42):
    """Cluster sentence vectors with NLTK's K-means using cosine distance.

    Parameters
    ----------
    lemma_sentences : list
        Not used by this function; kept so existing calls keep working.
    data : list of numpy.ndarray
        One vector per sentence.
    num_clusters : int
        Number of means requested from the clusterer.
    repeats : int
        Number of randomised clustering trials, forwarded to
        `KMeansClusterer`.
    seed : int or None
        Seed for the clusterer's random generator so that re-running the
        notebook yields the same clusters. Pass None to let the clusterer
        create its own unseeded generator.

    Returns
    -------
    tuple
        `(clusterer, results)`, where `results` maps each cluster label to
        the list of indices into `data` that were assigned to it.
    """
    import random  # local import keeps this cell runnable on its own

    rng = random.Random(seed) if seed is not None else None
    clusterer = KMeansClusterer(num_clusters, cosine_distance, repeats=repeats, rng=rng)

    # samples are assigned to cluster labels
    # starting from 0
    clusters = clusterer.cluster(data, assign_clusters=True)

    results = {i: [] for i in set(clusters)}

    for idx, c in enumerate(clusters):
        results[c].append(idx)

    return clusterer, results

# Fit K-means on the sentence vectors with the function's default settings.
clusterer, groups = group_sentence_by_kmean(lemma_sentences=lemma_sentences, data=data)

def distance(p1, p2):
    """Return the value of `cosine_distance` for the pair `p1`, `p2`."""
    dist = cosine_distance(p1, p2)
    return dist

def top_sentences_centroid(centroids, groups, data, lemma_sentences, topn=20):
    """Print, for every cluster, the `topn` sentences closest to its centroid.

    Parameters
    ----------
    centroids : indexable
        Cluster centres, indexed by cluster label.
    groups : dict
        Maps each cluster label to the list of indices into `data` (and
        `lemma_sentences`) that belong to it.
    data : sequence
        Sentence vectors.
    lemma_sentences : sequence of list of str
        Tokenized sentences; joined with spaces when printed.
    topn : int
        Maximum number of sentences printed per cluster.

    Returns
    -------
    None
        The function only prints.
    """
    for cid, idxs in groups.items():
        pair_dists = np.array([distance(centroids[cid], data[idx]) for idx in idxs])

        print(f"Cluster {cid}:")
        print("----------------------------")

        # `distance` is a distance (smaller means closer), so the nearest
        # sentences sit at the front of the ascending argsort. Slicing the
        # tail would list the sentences farthest from the centroid.
        for i in pair_dists.argsort()[:topn]:
            print(" ".join(lemma_sentences[idxs[i]]))
            print("----------------------------")
        print()

In [None]:
# Report sentences per K-means cluster, measured against the fitted means.
top_sentences_centroid(
    centroids=clusterer.means(),
    groups=groups,
    data=data,
    lemma_sentences=lemma_sentences,
)

Cluster 0:
----------------------------
significant limitation on the company 's ability to manufacture product due to disruption of manufacturing operation or related infrastructure could have a material adverse effect on the company 's sale revenue , cost , result of operation , credit rating , and financial condition .
----------------------------
unfavorable global economic condition could adversely affect we business , financial condition , or result of operation .
----------------------------
the potential effect of this condition could have a material adverse effect on we business , result of operation and financial condition .
----------------------------
the effect of global , regional , and local weather condition , and climate change could also adversely impact we result of operation .
----------------------------
we business could be adversely affect by the effect of a public health epidemic .
----------------------------
such closures may disrupt we business operation and 

# Gaussian mixture model (GMM)

In [None]:
from sklearn import mixture

def group_sentence_by_GMM(lemma_sentences, data, num_clusters=8, repeats=5):
    """Cluster sentence vectors with a spherical Gaussian mixture model.

    Parameters
    ----------
    lemma_sentences : list
        Not used by this function; kept so existing calls keep working.
    data : array-like
        One vector per sentence.
    num_clusters : int
        Number of mixture components.
    repeats : int
        Number of initialisations to run (`n_init`); the best fit is kept.
        Previously this argument was accepted but ignored.

    Returns
    -------
    tuple
        `(gmm, results)`, where `results` maps each component label to the
        list of indices into `data` that were assigned to it.
    """
    gmm = mixture.GaussianMixture(
        num_clusters,
        covariance_type="spherical",
        n_init=repeats,
        random_state=42,
    )

    # samples are assigned to cluster labels
    # starting from 0
    clusters = gmm.fit_predict(data)

    results = {i: [] for i in set(clusters)}

    for idx, c in enumerate(clusters):
        results[c].append(idx)

    return gmm, results



def distance(p1, p2):
    """Return the value of `cosine_distance` for the pair `p1`, `p2`."""
    dist = cosine_distance(p1, p2)
    return dist

def top_sentences_centroid(centroids, groups, data, lemma_sentences, topn=20):
    """Print, for every cluster, the `topn` sentences closest to its centroid.

    Parameters
    ----------
    centroids : indexable
        Cluster centres, indexed by cluster label.
    groups : dict
        Maps each cluster label to the list of indices into `data` (and
        `lemma_sentences`) that belong to it.
    data : sequence
        Sentence vectors.
    lemma_sentences : sequence of list of str
        Tokenized sentences; joined with spaces when printed.
    topn : int
        Maximum number of sentences printed per cluster.

    Returns
    -------
    None
        The function only prints.
    """
    for cid, idxs in groups.items():
        pair_dists = np.array([distance(centroids[cid], data[idx]) for idx in idxs])

        print(f"Cluster {cid}:")
        print("----------------------------")

        # `distance` is a distance (smaller means closer), so the nearest
        # sentences sit at the front of the ascending argsort. Slicing the
        # tail would list the sentences farthest from the centroid.
        for i in pair_dists.argsort()[:topn]:
            print(" ".join(lemma_sentences[idxs[i]]))
            print("--------------------------------")
        print()

In [None]:
# Fit the Gaussian mixture, then report sentences per component against its means.
clusterer, groups = group_sentence_by_GMM(lemma_sentences=lemma_sentences, data=data)
top_sentences_centroid(
    centroids=clusterer.means_,
    groups=groups,
    data=data,
    lemma_sentences=lemma_sentences,
)

Cluster 0:
----------------------------
some carrier have also park or retire a portion of they fleets and have reduce workforces and flight .
--------------------------------
the guideline do not specifically address the use of buprenorphine for chronic pain or make treatment recommendation about the use of abuse - deterrent opioids .
--------------------------------
depend on the final term of Brexit , we could face new regulatory cost and challenge and great volatility in the pound sterling and the Euro .
--------------------------------
covidien and the IRS have conclude and reach agreement on its audit of covidien 's U.S. federal income tax return for all tax year through 2012 .
--------------------------------
lps or endotoxin translocation be think to be a primary cause of downstream signal in the liver cause inflammation and damage .
--------------------------------
the company determine the fair value of its supply agreement liability and above market supply agreement liabilit

# Latent Dirichlet allocation (LDA)

In [None]:
# Rebuild each lemmatized sentence as one space-separated string.
# Note: this rebinds `sentences`, which earlier held the sentence objects.
sentences = [" ".join(s) for s in lemma_sentences]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

def group_sentence_by_lda(sentences, num_clusters=8):
    """Assign each sentence to its dominant LDA topic.

    Parameters
    ----------
    sentences : list of str
        Sentences as plain strings; they are vectorized with a
        `CountVectorizer(min_df=5, stop_words="english")`.
    num_clusters : int
        Number of LDA topics.

    Returns
    -------
    dict
        Maps each topic index that received at least one sentence to a list
        of `(sentence_index, weight)` pairs, where `weight` is that
        sentence's value for the topic in the fitted document-topic matrix.
    """
    tf_vectorizer = CountVectorizer(min_df=5, stop_words="english")
    tf = tf_vectorizer.fit_transform(sentences)

    # The former `tf_vectorizer.get_feature_names()` call was dropped: its
    # result was never used, and the method no longer exists in current
    # scikit-learn releases (replaced by `get_feature_names_out`).

    lda = LatentDirichletAllocation(
        n_components=num_clusters,
        max_iter=30,
        verbose=1,
        evaluate_every=1,
        n_jobs=1,
        random_state=0,
    )

    doc_distribution = lda.fit_transform(tf)
    # Dominant topic per sentence: last entry of each row's ascending argsort.
    clusters = doc_distribution.argsort()[:, -1]

    results = {i: [] for i in set(clusters)}

    for idx, c in enumerate(clusters):
        results[c].append((idx, doc_distribution[idx][c]))

    return results

def print_cluster_topn(results, sentences, cluster_number, topn=20):
    """Print the `topn` highest-scoring sentences of one cluster.

    `results[cluster_number]` is a list of `(sentence_index, score)` pairs;
    entries are listed by descending score, ties keeping their original order.
    """
    ranked = sorted(results[cluster_number], key=lambda pair: pair[1], reverse=True)
    print(f"Cluster {cluster_number}:\n-------------")
    for idx, _score in ranked[:topn]:
        print(idx, sentences[idx])
        print("---------------------")

In [None]:
# Fit LDA on the joined sentences and group them by dominant topic.
results = group_sentence_by_lda(sentences=sentences)

In [None]:
# Show the 20 highest-weighted sentences of topic 0.
print_cluster_topn(results=results, sentences=sentences, cluster_number=0, topn=20)

Cluster 0:
-------------
3065 the federal physician payments Sunshine Act , enact under the ppaca , which require certain manufacturer of drug , device , biologics , and medical supplies for which payment be available under medicare , medicaid , or the Children 's Health insurance Program , with specific exception , to make annual report to the centers for medicare & medicaid service , or cms , regard any '' transfer of value '' provide to physician and teach hospital .
---------------------
3057 Failure to submit timely , accurately , and completely the require information for all payment , transfer of value and ownership or investment interest may result in civil monetary penalty of up to a aggregate of $ 150000 per year and up to a additional aggregate of $ 1 million per year for '' know failure , '' for all payment , transfer of value or ownership or investment interest that be not timely , accurately , and completely report in a annual submission .
---------------------
3062 Failu