In [11]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.decomposition import LatentDirichletAllocation
from nltk.stem.lancaster import LancasterStemmer
from collections import defaultdict

In [2]:
# Load the passage table (columns shown below: ticker, text).
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
TEXT_PICKLE_PATH = "text_df_v2.pkl"
df = pd.read_pickle(TEXT_PICKLE_PATH)
df

Unnamed: 0,ticker,text
0,A,Both our domestic and international operation...
1,A,"months ended April 30, 2020 is dependent in p..."
2,A,", we did not identify any triggering events or..."
3,A,of our common stock under this program. On Ma...
4,A,The following discussion should be read in con...
...,...,...
155511,ZTS,The novel coronavirus (COVID-19) was identifie...
155512,ZTS,The COVID-19 pandemic also may reduce demand f...
155513,ZTS,"Moreover, measures imposed by governments and ..."
155514,ZTS,The COVID-19 pandemic has also significantly i...


In [3]:
# Remove rows that are exact duplicates across all columns (first occurrence
# is kept).  This mutates `df` in place; the original row labels are retained,
# so the index is no longer contiguous afterwards.
df.drop_duplicates(inplace=True)
df

Unnamed: 0,ticker,text
0,A,Both our domestic and international operation...
1,A,"months ended April 30, 2020 is dependent in p..."
2,A,", we did not identify any triggering events or..."
3,A,of our common stock under this program. On Ma...
4,A,The following discussion should be read in con...
...,...,...
155511,ZTS,The novel coronavirus (COVID-19) was identifie...
155512,ZTS,The COVID-19 pandemic also may reduce demand f...
155513,ZTS,"Moreover, measures imposed by governments and ..."
155514,ZTS,The COVID-19 pandemic has also significantly i...


In [4]:
# NLTK's stock English stop-word list (needs the "stopwords" corpus installed locally).
english_stop_words = stopwords.words(fileids='english')

In [5]:
# Corpus-specific stop words: terms that occur in nearly every passage and
# would otherwise dominate the topics.  All entries are lowercase.
additional_stop_words = [
    # pandemic vocabulary
    "covid", "pandemic", "novel", "coronavirus", "global",
    # magnitudes and reporting periods
    "million", "thousand", "hundred", "billion", "quarter", "year",
    # month names
    "january", "february", "march", "april", "may", "june",
    "july", "august", "september", "october", "november", "december",
    # month abbreviations ("may" is already listed among the full names)
    "jan", "feb", "mar", "apr", "jun", "jul", "aug", "sep", "oct", "nov", "dec",
    # generic corporate filler
    "company", "business", "us", "also", "could",
]

In [6]:
# Single combined list handed to the vectorizer: NLTK defaults first, then
# the corpus-specific additions.
all_stop_words = [*english_stop_words, *additional_stop_words]

In [7]:
def remove_numbers(text):
    """Return ``text`` with every run of digits deleted.

    Only the digits themselves are removed; neighbouring punctuation and
    whitespace stay put, so ``"COVID-19"`` becomes ``"COVID-"``.
    """
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

In [8]:
# Build a separate cleaned frame so the raw passages in `df` keep their digits.
df_clean = df.assign(text=df["text"].apply(remove_numbers))
df_clean

Unnamed: 0,ticker,text
0,A,Both our domestic and international operation...
1,A,"months ended April , is dependent in part on..."
2,A,", we did not identify any triggering events or..."
3,A,of our common stock under this program. On Ma...
4,A,The following discussion should be read in con...
...,...,...
155511,ZTS,The novel coronavirus (COVID-) was identified ...
155512,ZTS,The COVID- pandemic also may reduce demand for...
155513,ZTS,"Moreover, measures imposed by governments and ..."
155514,ZTS,The COVID- pandemic has also significantly inc...


In [17]:
# TF-IDF over unigrams and bigrams.  Terms found in fewer than 10 passages are
# discarded, along with everything in the combined stop-word list.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=10,
    stop_words=all_stop_words,
)

# Sparse document-by-term matrix: one row per cleaned passage.
doc_word = vectorizer.fit_transform(df_clean["text"])
doc_word.shape

(12396, 11425)

In [21]:
# Factorise the TF-IDF matrix into 8 topics.
# The seed is pinned so the factorisation is repeatable across kernel
# restarts: later cells refer to topics by their index (e.g. topic 6), which
# is only meaningful if the same topics come out in the same order each run.
nmf_model = NMF(n_components=8, random_state=42)

# Document-by-topic weights: one row per passage, one column per topic.
doc_topic = nmf_model.fit_transform(doc_word)
doc_topic.shape

(12396, 8)

In [22]:
# Topic-by-term weight matrix from the fitted NMF model: one row per topic,
# one column per vocabulary term of the vectorizer.
topic_word = nmf_model.components_
topic_word.shape

(8, 11425)

In [20]:
# Vocabulary in the column order of the TF-IDF matrix.
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back only on older installs.
if hasattr(vectorizer, "get_feature_names_out"):
    words = list(vectorizer.get_feature_names_out())
else:
    words = vectorizer.get_feature_names()

# For every topic (row), the column indices of its highest-weighted terms,
# strongest first.  argsort is ascending, so the slice walks backwards from
# the last column.  Note that -1:-20:-1 yields 19 terms per topic, not 20.
t = nmf_model.components_.argsort(axis=1)[:, -1:-20:-1]

# Translate the column indices back into the terms themselves.
topic_words = [[words[term_idx] for term_idx in topic_row] for topic_row in t]
topic_words

[['health',
  'including',
  'customers',
  'employees',
  'restrictions',
  'measures',
  'products',
  'spread',
  'travel',
  'demand',
  'world',
  'supply',
  'continue',
  'impact',
  'orders',
  'outbreak',
  'actions',
  'significant',
  'economic'],
 ['sales',
  'primarily',
  'due',
  'net',
  'lower',
  'offset',
  'compared',
  'partially',
  'primarily due',
  'increased',
  'partially offset',
  'months',
  'driven',
  'months ended',
  'ended',
  'three',
  'decrease',
  'revenue',
  'net sales'],
 ['statements',
  'forward looking',
  'looking',
  'forward',
  'looking statements',
  'factors',
  'risks',
  'report',
  'risk',
  'form',
  'risk factors',
  'report form',
  'uncertainties',
  'item',
  'risks uncertainties',
  'information',
  'future',
  'estimates',
  'actual'],
 ['results',
  'operations',
  'results operations',
  'financial',
  'condition',
  'financial condition',
  'adversely',
  'operations financial',
  'impact',
  'condition results',
  'advers

In [99]:
# Label each passage with the index of its highest-weighted topic.
# The assignment is positional, so `doc_topic` must have one row per row of `df`.
dominant_topic = doc_topic.argmax(axis=1)
df['topic'] = dominant_topic
df

Unnamed: 0,ticker,text,topic
0,A,Both our domestic and international operation...,4
1,A,"months ended April 30, 2020 is dependent in p...",6
2,A,", we did not identify any triggering events or...",5
3,A,of our common stock under this program. On Ma...,3
4,A,The following discussion should be read in con...,1
...,...,...,...
155511,ZTS,The novel coronavirus (COVID-19) was identifie...,4
155512,ZTS,The COVID-19 pandemic also may reduce demand f...,4
155513,ZTS,"Moreover, measures imposed by governments and ...",0
155514,ZTS,The COVID-19 pandemic has also significantly i...,3


In [100]:
# Group passage texts by their assigned topic: topic index -> list of texts,
# in row order.  Iterating the two columns directly avoids the per-row Series
# that `iterrows()` constructs (and the index value that was never used).
multivalue_dict = defaultdict(list)
for topic_id, passage in zip(df['topic'], df['text']):
    multivalue_dict[topic_id].append(passage)

In [101]:
# Show every passage whose dominant topic is index 6.
# (Because this is a defaultdict, looking up a missing key inserts an empty list.)
multivalue_dict[6]

[' months ended April 30, 2020 is dependent in part on forecasts of full year results. The impact of the COVID-19 outbreak on the economic environment is uncertain and may change these forecasts, which could impact tax expense.',
 ' months ended April 30, 2020, is dependent in part on forecasts of full year results. The impact of the COVID-19 outbreak to the economic environment is uncertain and may change these forecasts, which could impact tax expense.',
 'Economic stimulus legislation was passed in many countries in response to COVID-19. In March in the U.S., the ',
 ' that have materially affected, or are reasonably likely to materially affect, our internal control over financial reporting. We are continually monitoring and assessing the effect of the COVID-19 situation on our internal controls to minimize the impact on their design and operating effectiveness.',
 'The global spread of COVID-19 is having, and will continue to have, an adverse impact on our operations, sales and del