In [141]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.decomposition import NMF
from sklearn.decomposition import LatentDirichletAllocation
from nltk.stem.lancaster import LancasterStemmer

In [142]:
# Load the text DataFrame from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load this file
# if it comes from a trusted source.
df = pd.read_pickle("text_df_v2.pkl")
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,ticker,text
0,A,Both our domestic and international operation...
1,A,"months ended April 30, 2020 is dependent in p..."
2,A,", we did not identify any triggering events or..."
3,A,of our common stock under this program. On Ma...
4,A,The following discussion should be read in con...
...,...,...
155511,ZTS,The novel coronavirus (COVID-19) was identifie...
155512,ZTS,The COVID-19 pandemic also may reduce demand f...
155513,ZTS,"Moreover, measures imposed by governments and ..."
155514,ZTS,The COVID-19 pandemic has also significantly i...


In [143]:
# NLTK's English stop-word list. The corpus is not bundled with the nltk
# package, so on a fresh environment the lookup raises LookupError; fetch the
# corpus once and retry so the notebook survives "Restart & Run All".
try:
    english_stop_words = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    english_stop_words = stopwords.words('english')

In [144]:
# Extra stop words specific to this corpus: pandemic vocabulary, magnitude and
# reporting-period words, year/number tokens, and month names with their
# abbreviations ("may" has no separate abbreviation).
# The former entries "'19'" and "'20" carried stray apostrophes; the default
# CountVectorizer tokenizer never emits tokens containing apostrophes, so they
# could not match anything. They are replaced by the plain token "20"
# ("19" was already present).
additional_stop_words = ["2020", "2021", "2019", "covid", "pandemic", "novel", "coronavirus", "global",
                        "million", "thousand", "hundred", "billion", "quarter", "year",
                        "20", "19", "january", "february", "march", "april", "may", "june",
                        "july", "august", "september", "october", "november", "december",
                        "jan", "feb", "mar", "apr", "jun", "jul", "aug", "sep", "oct", "nov",
                        "dec"]

In [145]:
# Combined stop-word list: the NLTK English words followed by the
# corpus-specific additions.
all_stop_words = [*english_stop_words, *additional_stop_words]

In [146]:
_DIGIT_RUN = re.compile(r'\d+')


def remove_numbers(text):
    """Return `text` with every run of digits deleted.

    Only the digits themselves are removed; surrounding punctuation and
    whitespace are left as they were (e.g. "COVID-19" becomes "COVID-").
    """
    return _DIGIT_RUN.sub('', text)

In [147]:
# Build a cleaned copy of the frame whose "text" column has digit runs
# stripped; the original `df` is left untouched.
df_clean = df.assign(text=df["text"].apply(remove_numbers))
df_clean

Unnamed: 0,ticker,text
0,A,Both our domestic and international operation...
1,A,"months ended April , is dependent in part on..."
2,A,", we did not identify any triggering events or..."
3,A,of our common stock under this program. On Ma...
4,A,The following discussion should be read in con...
...,...,...
155511,ZTS,The novel coronavirus (COVID-) was identified ...
155512,ZTS,The COVID- pandemic also may reduce demand for...
155513,ZTS,"Moreover, measures imposed by governments and ..."
155514,ZTS,The COVID- pandemic has also significantly inc...


In [148]:
# Bag-of-words counts over unigrams and bigrams, dropping the combined stop
# words and any term that appears in fewer than 5 documents.
vectorizer = CountVectorizer(
    stop_words=all_stop_words,
    ngram_range=(1, 2),
    min_df=5,
)

doc_word = vectorizer.fit_transform(df_clean["text"])
doc_word.shape



(22793, 50598)

In [149]:
# Factorize the document-term counts into 8 topics with NMF.
# random_state is fixed because NMF's default initialisation relies on a
# randomized SVD; without a seed the topics can differ from run to run.
nmf_model = NMF(n_components=8, random_state=42)
# Rows are documents, columns are the 8 topic weights.
doc_topic = nmf_model.fit_transform(doc_word)
doc_topic.shape

(22793, 8)

In [150]:
# Topic-term weight matrix from the fitted NMF model: one row per topic,
# one column per vocabulary term of the vectorizer.
topic_word = nmf_model.components_
# Display the shape as a quick sanity check (n_topics, n_terms).
topic_word.shape

(8, 50598)

In [151]:
# Vocabulary terms, indexed by column of the document-term matrix.
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() when available and fall back to
# the old method on older installations. `words` is kept as a plain list.
if hasattr(vectorizer, "get_feature_names_out"):
    words = list(vectorizer.get_feature_names_out())
else:
    words = vectorizer.get_feature_names()
# For every topic, the column indices of its 14 highest-weighted terms,
# ordered from strongest to weakest (slice -1:-15:-1 walks back from the end).
t = nmf_model.components_.argsort(axis=1)[:, -1:-15:-1]
topic_words = [[words[term_idx] for term_idx in topic_row] for topic_row in t]
topic_words

[['business',
  'including',
  'could',
  'products',
  'customers',
  'result',
  'ability',
  'employees',
  'operations',
  'demand',
  'services',
  'health',
  'supply',
  'restrictions'],
 ['statements',
  'forward',
  'looking',
  'forward looking',
  'looking statements',
  'risks',
  'future',
  'results',
  'factors',
  'including',
  'report',
  'uncertainties',
  'risks uncertainties',
  'changes'],
 ['sales',
  'due',
  'net',
  'primarily',
  'increased',
  'increase',
  'lower',
  'first',
  'offset',
  'compared',
  'revenue',
  'related',
  'months',
  'higher'],
 ['company',
  'impact company',
  'related',
  'company business',
  'financial',
  'certain',
  'company ability',
  'share',
  'assets',
  'tax',
  'future',
  'expects',
  'act',
  'time'],
 ['financial',
  'operations',
  'results',
  'results operations',
  'business',
  'condition',
  'financial condition',
  'could',
  'adversely',
  'affect',
  'operations financial',
  'material',
  'adverse',
  'con