## Use stanze to break paragragh into sentence and lemmatize each sentence

In [3]:
import stanza
import pandas as pd
from ast import literal_eval

In [2]:
pip install stanza

Collecting stanza
[?25l  Downloading https://files.pythonhosted.org/packages/3b/a0/baecd55c7f1108a4b2459f46dde7aa765e393c735662861fe94ba7bc4ba3/stanza-1.2.2-py3-none-any.whl (337kB)
[K     |█                               | 10kB 13.9MB/s eta 0:00:01[K     |██                              | 20kB 16.4MB/s eta 0:00:01[K     |███                             | 30kB 15.5MB/s eta 0:00:01[K     |███▉                            | 40kB 11.2MB/s eta 0:00:01[K     |████▉                           | 51kB 5.6MB/s eta 0:00:01[K     |█████▉                          | 61kB 5.7MB/s eta 0:00:01[K     |██████▉                         | 71kB 6.2MB/s eta 0:00:01[K     |███████▊                        | 81kB 6.2MB/s eta 0:00:01[K     |████████▊                       | 92kB 6.3MB/s eta 0:00:01[K     |█████████▊                      | 102kB 6.5MB/s eta 0:00:01[K     |██████████▊                     | 112kB 6.5MB/s eta 0:00:01[K     |███████████▋                    | 122kB 6.5MB/s eta 0

In [4]:
nlp = stanza.Pipeline(lang='en', processors='tokenize, lemma')


ResourcesFileNotFoundError: ignored

In [None]:
text_2020 = pd.read_excel("10K_2020.xlsx",index_col=0).dropna().reset_index(drop=True)

In [None]:
text_2020

Unnamed: 0,path,para_keywords
0,D:\10-K\10-K\20\0000001750\0001104659-20-08531...,"Upon entering the fourth quarter in March, we ..."
1,D:\10-K\10-K\20\0000001800\0001104659-20-02390...,"2013 to 2015 — President, Early Technologies, ..."
2,D:\10-K\10-K\20\0000001961\0001264931-20-00005...,"After all of our operations were spun off,\nou..."
3,D:\10-K\10-K\20\0000002098\0001564590-20-01085...,"If a pandemic, epidemic or outbreak of an infe..."
4,D:\10-K\10-K\20\0000002186\0001654954-20-00224...,"In\nDecember 2019, a strain of the coronavirus..."
...,...,...
2447,D:\10-K\10-K\20\0001795250\0001795250-20-00001...,Impact of the COVID-19 Pandemic on Our Busines...
2448,D:\10-K\10-K\20\0001796129\0001564590-20-04438...,Management continues to evaluate the impact of...
2449,D:\10-K\10-K\20\0001803284\0001820271-20-00000...,Our business could be adversely affected by th...
2450,D:\10-K\10-K\20\0001804585\0001607062-20-00030...,Based\non the preliminary results from the Pha...


In [None]:
at_least_num_words = 10
# tokenize paragraph to sentences
def sentences_tokenize(paragraph, nlp): 
    doc = nlp(paragraph)
    return doc.sentences

def preprocess_paragraph(p):
    return p.replace("\n", " ")

# split original text into list of paragraphs (paragraphs are separated by "-------...")
def get_paragraphs_list(ps):
    return [p for p in preprocess_paragraph(ps).split("---------------------------") if p != "\n"]

# tokenize sentence and lemma it
def get_all_sentences(paras):
    sentences = []
    nlp = stanza.Pipeline(lang='en', processors='tokenize, lemma')
    
    for idx, ps in enumerate(paras):
        print(f"\r{round((idx + 1) / paras.shape[0] * 100, 4)}%", end="", flush=True )
        for p in get_paragraphs_list(ps):
            sts = sentences_tokenize(p, nlp)

            for s in sts:
                if len(s.words) > at_least_num_words:
                    sentences.append(s)
            
    return sentences

In [None]:
sentences = get_all_sentences(text_2020["para_keywords"])

2021-06-23 21:55:24 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| lemma     | combined |

2021-06-23 21:55:24 INFO: Use device: cpu
2021-06-23 21:55:24 INFO: Loading: tokenize
2021-06-23 21:55:24 INFO: Loading: lemma
2021-06-23 21:55:24 INFO: Done loading processors!


100.0%2%

In [None]:
lemma_sentences = []

for sentence in sentences:
    tmp_sentence = []
    for word in sentence.words:
        tmp_sentence.append(word.lemma)
        
    lemma_sentences.append(tmp_sentence)

# Train word2vec model

In [None]:
from gensim.models import Word2Vec
import numpy as np

In [None]:
model = Word2Vec(lemma_sentences, min_count=1, vector_size=100, window=5)

In [None]:
def transform_sentences_to_vector(sentences, model):
    
    vectors = []
    vector_size = model.vector_size
    
    for s in sentences:
        v = np.zeros(vector_size)
        for w in s:
            v += model.wv[w]
        vectors.append(v / len(s))
    
    return vectors
    

In [None]:
data = transform_sentences_to_vector(lemma_sentences, model)

# KMean

In [None]:
from nltk.cluster import KMeansClusterer, cosine_distance

In [None]:
def group_sentence_by_kmean(lemma_sentences, data, num_clusters=8, repeats=5):
    
    clusterer = KMeansClusterer(num_clusters, cosine_distance, repeats=repeats)

    # samples are assigned to cluster labels 
    # starting from 0
    clusters = clusterer.cluster(data, assign_clusters=True)
    
    results = {i: [] for i in set(clusters)}
    
    for idx, c in enumerate(clusters):
        results[c].append(idx)
    
    return clusterer, results

In [None]:
clusterer, groups = group_sentence_by_kmean(lemma_sentences, data)

In [None]:
def distance(p1, p2):
    return cosine_distance(p1, p2)

def top_sentences_centroid(centroids, groups, data, lemma_sentences, topn=20):
    top_sentences = []
    
    for cid, idxs in groups.items():
        pair_dists = []
        
        for idx in idxs:
            pair_dists.append(distance(centroids[cid], data[idx]))

        print(f"Cluster {cid}:")
        print("----------------------------")
        
        for i in np.array(pair_dists).argsort()[-topn:]:
            print(" ".join(lemma_sentences[idxs[i]]))
            print("--------------------------------")
        print()
        


In [None]:
results = top_sentences_centroid(clusterer.means(), groups, data, lemma_sentences)

Cluster 0:
----------------------------
we cyber - security could be compromised if person who be force to work from home do not maintain adequate information security .
--------------------------------
some of we product be manufacture at a single manufacturing facility or store at a single storage site .
--------------------------------
for example , we install physical barrier between employee in production facility , implement extensive clean and sanitation process for both production and office space , and implement broad work - from-home initiative for office personnel .
--------------------------------
most of this facility have be reopen to vary degree .
--------------------------------
this situation be change rapidly , and additional impact may arise that we be not aware of currently .
--------------------------------
we ability to consistently generate cash flow from operate activity be one of we most significant financial strength .
--------------------------------
if deman

# GMM

In [None]:
from sklearn import mixture

def group_sentence_by_GMM(lemma_sentences, data, num_clusters=8, repeats=5):
    
    gmm = mixture.GaussianMixture(num_clusters,
                              covariance_type="spherical", random_state=42)

    # samples are assigned to cluster labels 
    # starting from 0
    clusters = gmm.fit_predict(data)
    
    results = {i: [] for i in set(clusters)}
    
    for idx, c in enumerate(clusters):
        results[c].append(idx)
    
    return gmm, results



def distance(p1, p2):
    return cosine_distance(p1, p2)

def top_sentences_centroid(centroids, groups, data, lemma_sentences, topn=20):
    top_sentences = []
    
    for cid, idxs in groups.items():
        pair_dists = []
        
        for idx in idxs:
            pair_dists.append(distance(centroids[cid], data[idx]))

        print(f"Cluster {cid}:")
        print("----------------------------")
        
        for i in np.array(pair_dists).argsort()[-topn:]:
            print(" ".join(lemma_sentences[idxs[i]]))
            print("--------------------------------")
        print()

In [None]:
clusterer, groups = group_sentence_by_GMM(lemma_sentences, data)
top_sentences_centroid(clusterer.means_, groups, data, lemma_sentences)

Cluster 0:
----------------------------
this may adversely affect we result of operation , financial position and cash flow .
--------------------------------
we expect to continue to see reduce demand in we non-cargo commercial business .
--------------------------------
we also maintain a inventory of certain product that we anticipate will be in great demand .
--------------------------------
we ability to consistently generate cash flow from operate activity be one of we most significant financial strength .
--------------------------------
if the company do not effectively respond to the demand of its customer , they could decrease they purchase from the company , cause the company 's net sale and net earnings to decline .
--------------------------------
unfavorable global or regional economic condition could adversely affect we business and financial result .
--------------------------------
the company have observe a decline in customer demand for its service and expect this de

# LDA

In [None]:
sentences = []
for s in lemma_sentences:
    sentences.append(" ".join(s))

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

def group_sentence_by_lda(sentences, num_clusters=8):
    tf_vectorizer = CountVectorizer(min_df=5, stop_words = "english")
    tf = tf_vectorizer.fit_transform(sentences)
    
    tf_feature_names = tf_vectorizer.get_feature_names()
    
    lda = LatentDirichletAllocation(num_clusters, max_iter=30, verbose=1, evaluate_every=1, n_jobs=1, random_state=0)
    
    doc_distribution = lda.fit_transform(tf)
    clusters = doc_distribution.argsort()[:, -1]
    
    results = {i: [] for i in set(clusters)}
    
    for idx, c in enumerate(clusters):
        results[c].append((idx, doc_distribution[idx][c]))
    

    return results

def print_cluster_topn(results, sentences, cluster_number, topn=20):
    topn_sentences = sorted(results[cluster_number], key=lambda x: -x[1])[:topn]
    print(f"Cluster {cluster_number}:\n-------------")
    for idx, score in topn_sentences:
        print(idx, sentences[idx])
        print("---------------------")

In [None]:
results = group_sentence_by_lda(sentences)

iteration: 1 of max_iter: 30, perplexity: 1069.9717
iteration: 2 of max_iter: 30, perplexity: 905.9398
iteration: 3 of max_iter: 30, perplexity: 829.8457
iteration: 4 of max_iter: 30, perplexity: 788.7253
iteration: 5 of max_iter: 30, perplexity: 763.9482
iteration: 6 of max_iter: 30, perplexity: 749.5778
iteration: 7 of max_iter: 30, perplexity: 740.7231
iteration: 8 of max_iter: 30, perplexity: 734.7749
iteration: 9 of max_iter: 30, perplexity: 730.4759
iteration: 10 of max_iter: 30, perplexity: 727.3043
iteration: 11 of max_iter: 30, perplexity: 724.8657
iteration: 12 of max_iter: 30, perplexity: 722.9127
iteration: 13 of max_iter: 30, perplexity: 721.3494
iteration: 14 of max_iter: 30, perplexity: 719.9954
iteration: 15 of max_iter: 30, perplexity: 718.7817
iteration: 16 of max_iter: 30, perplexity: 717.6996
iteration: 17 of max_iter: 30, perplexity: 716.8086
iteration: 18 of max_iter: 30, perplexity: 716.0449
iteration: 19 of max_iter: 30, perplexity: 715.4217
iteration: 20 of max

In [None]:
print_cluster_topn(results, sentences, 0, 20)

Cluster 0:
-------------
24575 the $ 28.6 million , or 8.8 % , increase in net sale in 2020 as compare to 2019 be mainly due to a $ 27.6 million increase in net sale from Globe ( acquire in July 2019 ) , a $ 13.5 million increase in net sale at vacco due to high shipments of space product , partially offset by a $ 6.1 million decrease in net sale at mayday , a $ 3.7 million decrease in net sale at crissair and a $ 3.0 million decrease in net sale at pti all drive by the covid - 19 pandemic in the current year .
---------------------
4818 the decrease in the fsg 's operate income as a percentage of net sale reflect the previously mention low gross profit margin and a .5 % increase in sg&a expense as a percentage of net sale mainly from the previously mention high bad debt expense and fix cost efficiencies lose result from the pandemic 's impact , partially offset by the previously mention low performance - base compensation expense .
---------------------
68319 although we experience gr