# Part II. Text Preprocessing 

#### **Stage 3:** Preprocessing texts in MD&A 
- Extracting NPs
- Identify causes and effects 
- Constrain to sales growth, profit margins, etc. 
- Data Structure
- EDA visualiation

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import re
import time
import os
from ast import literal_eval # pandas store list as string; need to convert back
from collections import Counter
import spacy
nlp = spacy.load("en_core_web_sm")

# !pip install wordcloud
from wordcloud import WordCloud
import seaborn as sns
import matplotlib.pyplot as plt

pd.set_option('max_colwidth',None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 0)




In [2]:
# Load samples

data_root = "C:/Users/clair/Desktop/Thesis/masterThesis2022/Data/"
df = pd.read_pickle(data_root+"sample2_sentences.pkl")
#df['type'].replace("10Q","10-Q", inplace=True)
#df['type'].replace("10K","10-K", inplace=True)


In [3]:
df.head()

Unnamed: 0,ticker,type,file,ix,start,end,word_counter,VERB,PNOUN,NOUN,ADJ,ADV,ADP,CONJ,CCONJ,SCONJ,sent_len,causal_verbs,causal_phrases
0,OGN,10-Q,0001821825-21-000009,1,132687,176620,"{'VERB': [('include', 36), ('reflect', 24), ('relate', 22), ('offset', 18), ('increase', 16), ('continue', 15), ('base', 14), ('compare', 14), ('expect', 12), ('provide', 12), ('affect', 11), ('decline', 11), ('establish', 11), ('look', 9), ('denominate', 9), ('market', 8), ('condense', 8), ('end', 8), ('decrease', 8), ('•', 7), ('use', 7), ('make', 6), ('result', 6), ('receive', 6), ('pay', 6), ('enter', 6), ('secure', 6), ('see', 6), ('operate', 5), ('record', 5), ('issue', 5), ('drive', 5), ('identify', 4), ('follow', 4), ('incur', 4), ('fund', 4), ('regard', 3), ('cause', 3), ('differ', 3), ('consider', 3), ('know', 3), ('guarantee', 3), ('vary', 3), ('agree', 3), ('develop', 3), ('combine', 3), ('distribute', 3), ('remain', 3), ('assume', 3), ('lower', 3), ('toll', 3), ('restructure', 3), ('work', 3)], 'PNOUN': [], 'NOUN': [('sale', 73), ('%', 61), ('month', 61), ('quarter', 57), ('product', 43), ('cost', 27), ('operation', 23), ('statement', 21), ('impact', 21), ('note', 21), ('demand', 21), ('result', 20), ('competition', 20), ('cash', 20), ('pandemic', 19), ('increase', 19), ('market', 17), ('health', 16), ('tax', 15), ('treatment', 15), ('decline', 15), ('rate', 14), ('business', 12), ('term', 12), ('period', 12), ('agreement', 12), ('payment', 11), ('stock', 11), ('amount', 11), ('separation', 10), ('expense', 10), ('award', 10), ('income', 10), ('brand', 9), ('change', 9), ('benefit', 9), ('accounting', 9), ('recovery', 9), ('uncertainty', 8), ('employee', 8), ('commercialization', 8), ('country', 8), ('interest', 8), ('share', 8), ('connection', 8), ('right', 8), ('arrangement', 8), ('development', 7), ('company', 7), ('woman', 7), ('performance', 7), ('care', 7), ('biosimilar', 7), ('year', 7), ('capital', 7), ('risk', 6), ('growth', 6), ('factor', 6), ('patent', 6), ('loss', 6), ('law', 6), ('basis', 6), ('asset', 6), ('loan', 6), ('euro', 6), ('excess', 6), ('option', 6), ('volume', 6), ('decrease', 6), ('medicine', 6), ('estimate', 5), ('acquisition', 5), ('pricing', 5), ('liability', 5), ('portfolio', 5), ('facility', 5), ('indenture', 5), ('borrowing', 5), ('portion', 5), ('debt', 5), ('revenue', 5), ('launch', 5), ('cancer', 5), ('timing', 5), ('examination', 5), ('financing', 5), ('activity', 5), ('report', 4), ('time', 4), ('management', 4), ('plan', 4), ('difficulty', 4), ('physician', 4), ('provider', 4), ('date', 4), ('discussion', 4), ('manufacturing', 4), ('transaction', 4), ('milestone', 4), ('value', 4), ...], 'ADJ': [('second', 59), ('first', 55), ('low', 26), ('due', 22), ('generic', 20), ('high', 18), ('other', 17), ('certain', 15), ('global', 11), ('senior', 11), ('financial', 10), ('such', 10), ('net', 10), ('subject', 8), ('foreign', 8), ('worldwide', 8), ('principal', 8), ('aggregate', 7), ('-', 7), ('future', 6), ('ongoing', 6), ('attributable', 6), ('metastatic', 6), ('non', 6), ('current', 5), ('historical', 5), ('standalone', 5), ('secured', 5), ('negative', 4), ('biosimilar', 4), ('commercial', 4), ('significant', 4), ('new', 4), ('regulatory', 4), ('various', 4), ('effective', 4), ('comparable', 4), ('favorable', 4), ('gross', 4), ('actual', 3), ('third', 3), ('medical', 3), ('outstanding', 3), ('pursuant', 3), ('adjusted', 3), ('available', 3), ('cardiovascular', 3), ('vaginal', 3), ('several', 3), ('inflammatory', 3), ('discontinued', 3), ('fair', 3)], 'ADV': [('primarily', 33), ('well', 16), ('as', 15), ('partially', 15), ('respectively', 12), ('also', 9), ('forward', 8), ('approximately', 7), ('materially', 4), ('mainly', 4), ('otherwise', 3), ('negatively', 3), ('recently', 3), ('additionally', 3), ('outside', 3)], 'ADP': [('of', 371), ('in', 264), ('to', 147), ('for', 89), ('by', 48), ('with', 40), ('on', 39), ('from', 35), ('due', 30), ('under', 17), ('during', 17), ('at', 12), ('outside', 8), ('up', 6), ('within', 5), ('over', 5), ('through', 5), ('into', 5), ('across', 4)], 'CONJ': [], 'CCONJ': [('and', 262), ('or', 32), ('both', 4)], 'SCONJ': [('as', 59), ('upon', 7), ('that', 5)]}","{'include': 36, 'reflect': 24, 'relate': 22, 'offset': 18, 'increase': 16, 'continue': 15, 'base': 14, 'compare': 14, 'expect': 12, 'provide': 12, 'affect': 11, 'decline': 11, 'establish': 11, 'look': 9, 'denominate': 9, 'market': 8, 'condense': 8, 'end': 8, 'decrease': 8, '•': 7, 'use': 7, 'make': 6, 'result': 6, 'receive': 6, 'pay': 6, 'enter': 6, 'secure': 6, 'see': 6, 'operate': 5, 'record': 5, 'issue': 5, 'drive': 5, 'identify': 4, 'follow': 4, 'incur': 4, 'fund': 4, 'regard': 3, 'cause': 3, 'differ': 3, 'consider': 3, 'know': 3, 'guarantee': 3, 'vary': 3, 'agree': 3, 'develop': 3, 'combine': 3, 'distribute': 3, 'remain': 3, 'assume': 3, 'lower': 3, 'toll': 3, 'restructure': 3, 'work': 3}",{},"{'sale': 73, '%': 61, 'month': 61, 'quarter': 57, 'product': 43, 'cost': 27, 'operation': 23, 'statement': 21, 'impact': 21, 'note': 21, 'demand': 21, 'result': 20, 'competition': 20, 'cash': 20, 'pandemic': 19, 'increase': 19, 'market': 17, 'health': 16, 'tax': 15, 'treatment': 15, 'decline': 15, 'rate': 14, 'business': 12, 'term': 12, 'period': 12, 'agreement': 12, 'payment': 11, 'stock': 11, 'amount': 11, 'separation': 10, 'expense': 10, 'award': 10, 'income': 10, 'brand': 9, 'change': 9, 'benefit': 9, 'accounting': 9, 'recovery': 9, 'uncertainty': 8, 'employee': 8, 'commercialization': 8, 'country': 8, 'interest': 8, 'share': 8, 'connection': 8, 'right': 8, 'arrangement': 8, 'development': 7, 'company': 7, 'woman': 7, 'performance': 7, 'care': 7, 'biosimilar': 7, 'year': 7, 'capital': 7, 'risk': 6, 'growth': 6, 'factor': 6, 'patent': 6, 'loss': 6, 'law': 6, 'basis': 6, 'asset': 6, 'loan': 6, 'euro': 6, 'excess': 6, 'option': 6, 'volume': 6, 'decrease': 6, 'medicine': 6, 'estimate': 5, 'acquisition': 5, 'pricing': 5, 'liability': 5, 'portfolio': 5, 'facility': 5, 'indenture': 5, 'borrowing': 5, 'portion': 5, 'debt': 5, 'revenue': 5, 'launch': 5, 'cancer': 5, 'timing': 5, 'examination': 5, 'financing': 5, 'activity': 5, 'report': 4, 'time': 4, 'management': 4, 'plan': 4, 'difficulty': 4, 'physician': 4, 'provider': 4, 'date': 4, 'discussion': 4, 'manufacturing': 4, 'transaction': 4, 'milestone': 4, 'value': 4, ...}","{'second': 59, 'first': 55, 'low': 26, 'due': 22, 'generic': 20, 'high': 18, 'other': 17, 'certain': 15, 'global': 11, 'senior': 11, 'financial': 10, 'such': 10, 'net': 10, 'subject': 8, 'foreign': 8, 'worldwide': 8, 'principal': 8, 'aggregate': 7, '-': 7, 'future': 6, 'ongoing': 6, 'attributable': 6, 'metastatic': 6, 'non': 6, 'current': 5, 'historical': 5, 'standalone': 5, 'secured': 5, 'negative': 4, 'biosimilar': 4, 'commercial': 4, 'significant': 4, 'new': 4, 'regulatory': 4, 'various': 4, 'effective': 4, 'comparable': 4, 'favorable': 4, 'gross': 4, 'actual': 3, 'third': 3, 'medical': 3, 'outstanding': 3, 'pursuant': 3, 'adjusted': 3, 'available': 3, 'cardiovascular': 3, 'vaginal': 3, 'several': 3, 'inflammatory': 3, 'discontinued': 3, 'fair': 3}","{'primarily': 33, 'well': 16, 'as': 15, 'partially': 15, 'respectively': 12, 'also': 9, 'forward': 8, 'approximately': 7, 'materially': 4, 'mainly': 4, 'otherwise': 3, 'negatively': 3, 'recently': 3, 'additionally': 3, 'outside': 3}","{'of': 371, 'in': 264, 'to': 147, 'for': 89, 'by': 48, 'with': 40, 'on': 39, 'from': 35, 'due': 30, 'under': 17, 'during': 17, 'at': 12, 'outside': 8, 'up': 6, 'within': 5, 'over': 5, 'through': 5, 'into': 5, 'across': 4}",{},"{'and': 262, 'or': 32, 'both': 4}","{'as': 59, 'upon': 7, 'that': 5}","[4, 12, 2, 1, 4, 38, 60, 58, 20, 37, 29, 39, 16, 7, 612, 12, 27, 65, 1, 32, 61, 16, 30, 36, 25, 52, 48, 43, 35, 27, 22, 4, 20, 28, 45, 22, 20, 23, 17, 21, 51, 25, 30, 32, 20, 46, 15, 1, 81, 12, 65, 24, 44, 110, 90, 70, 66, 12, 44, 3, 56, 49, 1, 24, 31, 26, 28, 28, 51, 72, 55, 15, 26, 74, 66, 28, 62, 21, 57, 50, 58, 18, 32, 10, 26, 24, 2, 73, 41, 15, 34, 38, 22, 45, 22, 23, 25, 25, 90, 49, ...]","[COVID-19-related disruptions, including patients’ inability to access health care providers, prioritization of COVID-19 patients, as well as social distancing measures have negatively affected our results. , The decline during the first six months of 2021 primarily reflects decreases across markets due to ongoing generic competition for products within the established brands business, particularly for cardiovascular products Zetia and Vytorin (ezetimibe and simvastatin), lower sales of respiratory products Singulair (montelukast), Dulera and Nasonex, and generic competition for women’s health product, Revenues for the second quarter of 2021 and the first six months of 2021 primarily reflect our share of the profits., Revenues for the second quarter and the first six months of 2020 reflect supply sales of the generic product to the manufacturer., Sales growth in the second quarter and first six months of 2021 was driven primarily by continued demand growth in the United States since launch in 2017, partially offset by higher discount rates., For the second quarter and first six months of 2021, sales reflect uptake since the July 2020 launch in the United States, partially offset by a decrease in the EU reflecting increasing competitive pressures and price erosion., The sales decrease in the first six months of 2021 reflects lower demand in the United States, lower demand due to generic competition in Japan and the Asia Pacific region, the effect of the shift in product mix and lower demand in China, and lower sales in Canada as sales in the second quarter of 2020 was higher due to competitor supply shortages., Sales for the first six months of 2021 -37- also reflect lower demand in Europe and Canada in the beginning of 2021 due to the COVID-19 pandemic., The increase during the period reflects increases in manufacturing costs absorbed by Organon, increase in stand up costs, and cost related to tolling arrangements with Merck, which were not in place during the comparable prior year period., The increase also reflects increases in direct corporate Organon costs., The gross margin declines reflect an increase in stand up costs, as well as certain costs related to tolling arrangements with Merck, which have lower gross margin percentages compared to product sales., Restructuring Costs Certain of our operations have been affected by restructuring plans initiated by Merck., The decrease in effective interest rates for the six months ended June 30, 2021 reflect the beneficial impact of foreign earnings, the $70 million tax benefit relating to a portion of the non-U.S. step-up of tax basis as well as the income tax benefit recognized in connection with the conclusion of the Internal Revenue Service (IRS) examination of Merck’s 2015-2016 U.S. federal income tax returns., As a result of the examination conclusion, we reflected an allocation from Merck of $18 million in the Condensed Consolidated Financial Statements representing our portion of the payment made to the IRS., This net benefit reflects reductions in reserves for unrecognized tax benefits and other related liabilities for tax positions relating to the years that were under examination. , Accordingly, the historical results of operations of the Merck Retained Products have been reflected as discontinued operations in the Condensed Consolidated Financial Statements for all periods presented., The decrease in net income for both periods reflects an increase in costs and expenses incurred to establish Organon as a standalone entity, partially offset by higher sales due to higher demand for certain of our products across several markets in the second quarter of 2021., The overall increase in working capital of continuing operations was primarily driven by cash funding by Merck in connection with the Separation, offset by an increase in current liabilities with Merck primarily for inventory purchases, as well as increases in employee benefits and payroll. , Cash provided by operating activities was favorably impacted by an increase in accounts payable, including amounts due to Merck, partially offset by a decline in net income., The change in cash used in financing activities reflects the proceeds from the issuance of long term debt, the payment of related debt issuance costs and the settlement of the transactions with Merck in connection with the Separation (see Note 17 to our Condensed Consolidated Financial Statements).]","[As a result, $56 million of foreign currency gains due to spot rate fluctuations on the euro-denominated debt instruments are included in foreign currency translation adjustment in Other Comprehensive Income for the three and six months ended June 30, 2021., Operating expenses in the second quarter and first six months of 2021 were higher primarily due to the effect of lower promotional and selling costs incurred in the second quarter and first six months of 2020 attributable to the COVID-19 pandemic as well as incremental costs associated with establishing Organon as a standalone company., Operating Results Sales Overview U.S. plus international may not equal total due to rounding ., The increase is primarily due to higher sales of women’s health products, including Nexplanon/Implanon NXT, Follistim AQ (follitropin beta injection) and Ganirelix Acetate Injection, as well as higher sales of biosimilar products resulting from the continued uptake of Renflexis (infliximab-abda) in the United States and the uptake of Aybintio (bevacizumab) in the European Union (""EU"")., The sales increase was partially offset by ongoing generic competition for cardiovascular products Zetia and Vytorin (ezetimibe and simvastatin) mainly in Japan, decline in sales due to the volume-based procurement program (the ""VBP"") in China, an expiration of distribution agreement in Korea for Rosuzet in December 2020, and decreased demand for Cozaar/Hyzaar., The decline during the first six months of 2021 primarily reflects decreases across markets due to ongoing generic competition for products within the established brands business, particularly for cardiovascular products Zetia and Vytorin (ezetimibe and simvastatin), lower sales of respiratory products Singulair (montelukast), Dulera and Nasonex, and generic competition for women’s health product, The sales decline was offset by higher sales of women's health products Nexplanon/Implanon NXT, Follistim AQ (follitropin beta injection) and Ganirelix Acetate Injection due to higher demand, and higher sales of biosimilars resulting from the continued uptake of Renflexis mainly in the United States and Aybintio in the EU., Worldwide sales of NuvaRing , a vaginal contraceptive product, declined 16% and 22% in the second quarter and first six months of 2021 primarily due to ongoing generic competition in the United States and the EU., The decline in revenue for the second quarter and first six months of 2021 is due to the entry of a new market participant., Fertility Worldwide sales of Follistim AQ (marketed in most countries outside the United States as Puregon), a fertility treatment, increased 48% and 37% in the second quarter and the first six months of 2021, respectively, primarily due to volume growth in the United States as well as recovery from the COVID-19 pandemic in the United States Europe, Canada and China, partially offset by overall unfavorable discount rates in the U.S. for the six months ended June 30, 2021., Worldwide sales of Ganirelix Acetate Injection (marketed in certain countries outside the United States as Orgalutran), a fertility treatment, increased 117% in the second quarter of 2021, primarily due to the recovery from the COVID-19 pandemic in Europe, Canada and China., For the first six months of 2021, sales increased 101% primarily due to increased demand in the United States as well as recovery from the COVID-19 pandemic in the United States, Europe, Canada and China., Sales in the second quarter and first six months of 2021 decreased 2% and 27%, respectively, primarily due to timing of shipments to Brazil related to government orders., We recorded sales of $8 million and $16 million during the second quarter and first six months of 2021, respectively, with no comparable sales during the second quarter and first six months of 2020 due to the approval of Aybintio in the EU in August 2020 and its launch in September 2020., Cardiovascular Combined global sales of Zetia (marketed in most countries outside of the United States as Ezetrol ) and Vytorin (marketed outside of the United States as Inegy ), medicines for lowering LDL cholesterol, declined 18% during the second quarter of 2021 primarily driven by lower sales of Ezetrol in Japan., Sales decreased 26% in the first six months of 2021 primarily driven by lower sales of Ezetrol in Japan, lower demand in the United States due to generic competition, as well as lower sales of Ezetrol and Inegy in the EU., Sales of Atozet (ezetimibe and atorvastatin calcium) (marketed outside of the United States), a medicine for lowering LDL cholesterol, increased 5% in the second quarter of 2021 primarily due to volume growth in France and a slight increase in sales in various markets, partially offset by lower demand in Germany due to competition., Sales of Atozet declined 2% in the first six months of 2021 due to lower demand in the EU, primarily in Germany and Spain due to competition, coupled with unfavorable pricing, partially offset by a volume increase in France and higher demand in the Asia Pacific region., Sales of Rosuzet (ezetimibe and rosuvastatin calcium) (marketed outside of the United States), a medicine for lowering LDL cholesterol, declined 42% and 47% in the second quarter and first six months of 2021, respectively, due to the expiration of a distribution agreement in Korea in December 2020., The decrease in the second quarter is primarily due to a shift in product and channel mix and lower demand in China as well as lower demand as a result of growing generic competition in Japan., The sales decrease in the first six months of 2021 reflects lower demand in the United States, lower demand due to generic competition in Japan and the Asia Pacific region, the effect of the shift in product mix and lower demand in China, and lower sales in Canada as sales in the second quarter of 2020 was higher due to competitor supply shortages., Sales of Zocor decreased 21% in the first six months of 2021, primarily due to lower volumes in China due to the VBP impact., Respiratory Worldwide sales of Singulair , a once-a-day oral medicine for the chronic treatment of asthma and for the relief of symptoms of allergic rhinitis, declined 8% in the second quarter of 2021 primarily due to the lower performance in Japan attributable to generic competition as well as timing of shipments in Japan in the second quarter of 2020, VBP impact in China partially offset by the recovery from the COVID-19 pandemic. , Singulair sales in the first six months of 2021 decreased 22% primarily attributable to the impact of VBP in China, lower volume in Japan due to generic competition as well as the timing of shipments, and ongoing impact of the COVID-19 pandemic in the Asia Pacific region., Sales for the first six months of 2021 -37- also reflect lower demand in Europe and Canada in the beginning of 2021 due to the COVID-19 pandemic., Global sales of Nasonex , an inhaled nasal corticosteroid for the treatment of nasal allergy symptoms, increased 6% in the second quarter of 2021 primarily due to higher demand in China and favorable performance in Russia, partially offset by lower sales in the United States due to the impact of COVID-19 pandemic and generic competition in Japan., Global sales of Nasonex decreased 21% in the first six months of 2021 primarily driven by lower demand impacted by the COVID-19 pandemic across several markets in the United States, Europe, and Latin America, and generic competition in Japan, partially offset by higher demand in China., Global sales of Dulera , a combination medicine for the treatment of asthma, increased 34% in the second quarter of 2021 primarily due to favorable discount rates in the United States., For the first six months of 2021, global sales of Dulera decreased 26% largely due to significant buy-in in the six months of 2020 related to the COVID-19 pandemic, partially offset by the favorable discount rates in the United States in the second quarter of 2021., Non-Opioid Pain, Bone and Dermatology Sales of Arcoxia (etoricoxib), for the treatment of arthritis and pain, were slightly lower in the second quarter of 2021 primarily due to the impact of VBP in China., Sales of Arcoxia for the first six months of 2021 decreased 12% primarily due to the impact of VBP in China and lower demand in the Asia Pacific region attributable to the COVID-19 pandemic., Other Worldwide sales of Proscar , for the treatment of symptomatic benign prostate enlargement, declined 38% and 32% in the second quarter and first six months of 2021 primarily due to lower performance reflecting the impact of VBP in China., The increase in cost of sales for the second quarter is primarily due to increases in manufacturing costs and certain costs related to tolling arrangements with Merck which were not in place in the second quarter of 2020., Selling, General and Administrative Selling, general and administrative expenses increased 46% and 33% in the second quarter of 2021 and the first six months of 2021, respectively, due to costs incurred to establish Organon as a standalone entity, higher employee related costs, and higher selling and promotional costs., The decline in restructuring costs is due to lower allocated costs from Merck during the second quarter and the first six months of 2021 compared to the comparable periods of 2020., The decrease in net income for both periods reflects an increase in costs and expenses incurred to establish Organon as a standalone entity, partially offset by higher sales due to higher demand for certain of our products across several markets in the second quarter of 2021., The overall increase in working capital of continuing operations was primarily driven by cash funding by Merck in connection with the Separation, offset by an increase in current liabilities with Merck primarily for inventory purchases, as well as increases in employee benefits and payroll. , Cash provided by operating activities was favorably impacted by an increase in accounts payable, including amounts due to Merck, partially offset by a decline in net income., A discussion of accounting estimates considered critical because of the potential for a significant impact on the financial statements due to the inherent uncertainty in such estimates are disclosed in the Critical Accounting Estimates section of Management’s Discussion and Analysis of Financial Condition and Results of Operations included in Organon's Form 10.]"
1,OGN,10-Q,0001821825-21-000005,1,95825,123717,"{'VERB': [('compare', 25), ('reflect', 22), ('decline', 17), ('include', 13), ('continue', 12), ('relate', 10), ('establish', 9), ('combine', 8), ('provide', 8), ('drive', 8), ('offset', 8), ('expect', 8), ('increase', 8), ('affect', 7), ('enter', 6), ('denominate', 6), ('see', 6), ('market', 6), ('receive', 5), ('condense', 5), ('issue', 5), ('record', 5), ('allocate', 5), ('work', 5), ('operate', 4), ('base', 4), ('secure', 4), ('result', 4), ('end', 4), ('use', 4), ('pay', 3), ('assume', 3), ('revolve', 3), ('contribute', 3), ('couple', 3), ('lower', 3), ('incur', 3), ('restructure', 3), ('discontinue', 3), ('fund', 3)], 'PNOUN': [], 'NOUN': [('quarter', 69), ('sale', 51), ('%', 37), ('cost', 22), ('product', 20), ('operation', 19), ('cash', 18), ('note', 17), ('statement', 14), ('income', 14), ('treatment', 14), ('impact', 14), ('decline', 14), ('result', 13), ('pandemic', 13), ('expense', 12), ('demand', 12), ('market', 11), ('tax', 11), ('increase', 10), ('agreement', 9), ('term', 9), ('competition', 9), ('arrangement', 9), ('month', 9), ('capital', 9), ('accounting', 8), ('amount', 8), ('rate', 8), ('activity', 8), ('share', 7), ('year', 7), ('brand', 7), ('commercialization', 7), ('decrease', 7), ('part', 6), ('excess', 6), ('biosimilar', 6), ('country', 6), ('medicine', 6), ('benefit', 6), ('connection', 5), ('company', 5), ('subsidiary', 5), ('-', 5), ('indenture', 5), ('loan', 5), ('employee', 5), ('health', 5), ('revenue', 5), ('right', 5), ('cancer', 5), ('entity', 5), ('examination', 5), ('estimate', 5), ('business', 4), ('flow', 4), ('transaction', 4), ('payment', 4), ('borrowing', 4), ('patient', 4), ('manufacturer', 4), ('launch', 4), ('cholesterol', 4), ('change', 4), ('financing', 4), ('distribution', 3), ('holder', 3), ('basis', 3), ('position', 3), ('period', 3), ('postpartum', 3), ('management', 3), ('issuer', 3), ('credit', 3), ('i', 3), ('dollar', 3), ('facility', 3), ('interest', 3), ('floor', 3), ('step', 3), ('portion', 3), ('portfolio', 3), ('physician', 3), ('loss', 3), ('exclusivity', 3), ('volume', 3), ('fertility', 3), ('addition', 3), ('profit', 3), ('development', 3), ('disease', 3), ('party', 3), ('margin', 3), ('net', 3), ('taxis', 3), ('liability', 3), ('separation', 3), ('need', 3)], 'ADJ': [('first', 75), ('low', 19), ('financial', 17), ('due', 13), ('generic', 13), ('certain', 10), ('other', 10), ('senior', 7), ('global', 7), ('high', 7), ('subject', 6), ('net', 6), ('metastatic', 6), ('-', 6), ('standalone', 5), ('combined', 5), ('principal', 5), ('worldwide', 5), ('non', 5), ('pursuant', 4), ('aggregate', 4), ('attributable', 4), ('related', 4), ('such', 3), ('historical', 3), ('second', 3), ('respiratory', 3), ('cardiovascular', 3), ('vaginal', 3), ('most', 3), ('inflammatory', 3), ('gross', 3), ('foreign', 3), ('discontinued', 3), ('secured', 3), ('critical', 3), ('significant', 3)], 'ADV': [('primarily', 22), ('partially', 8), ('well', 6), ('as', 5), ('approximately', 5), ('additionally', 4), ('particularly', 4), ('respectively', 4), ('also', 3), ('largely', 3), ('outside', 3)], 'ADP': [('of', 254), ('in', 177), ('to', 105), ('for', 54), ('by', 35), ('with', 31), ('on', 21), ('from', 18), ('due', 18), ('under', 14), ('during', 14), ('at', 12), ('outside', 7), ('into', 5), ('after', 3), ('within', 3)], 'CONJ': [], 'CCONJ': [('and', 128), ('or', 8)], 'SCONJ': [('as', 39), ('upon', 4), ('that', 3)]}","{'compare': 25, 'reflect': 22, 'decline': 17, 'include': 13, 'continue': 12, 'relate': 10, 'establish': 9, 'combine': 8, 'provide': 8, 'drive': 8, 'offset': 8, 'expect': 8, 'increase': 8, 'affect': 7, 'enter': 6, 'denominate': 6, 'see': 6, 'market': 6, 'receive': 5, 'condense': 5, 'issue': 5, 'record': 5, 'allocate': 5, 'work': 5, 'operate': 4, 'base': 4, 'secure': 4, 'result': 4, 'end': 4, 'use': 4, 'pay': 3, 'assume': 3, 'revolve': 3, 'contribute': 3, 'couple': 3, 'lower': 3, 'incur': 3, 'restructure': 3, 'discontinue': 3, 'fund': 3}",{},"{'quarter': 69, 'sale': 51, '%': 37, 'cost': 22, 'product': 20, 'operation': 19, 'cash': 18, 'note': 17, 'statement': 14, 'income': 14, 'treatment': 14, 'impact': 14, 'decline': 14, 'result': 13, 'pandemic': 13, 'expense': 12, 'demand': 12, 'market': 11, 'tax': 11, 'increase': 10, 'agreement': 9, 'term': 9, 'competition': 9, 'arrangement': 9, 'month': 9, 'capital': 9, 'accounting': 8, 'amount': 8, 'rate': 8, 'activity': 8, 'share': 7, 'year': 7, 'brand': 7, 'commercialization': 7, 'decrease': 7, 'part': 6, 'excess': 6, 'biosimilar': 6, 'country': 6, 'medicine': 6, 'benefit': 6, 'connection': 5, 'company': 5, 'subsidiary': 5, '-': 5, 'indenture': 5, 'loan': 5, 'employee': 5, 'health': 5, 'revenue': 5, 'right': 5, 'cancer': 5, 'entity': 5, 'examination': 5, 'estimate': 5, 'business': 4, 'flow': 4, 'transaction': 4, 'payment': 4, 'borrowing': 4, 'patient': 4, 'manufacturer': 4, 'launch': 4, 'cholesterol': 4, 'change': 4, 'financing': 4, 'distribution': 3, 'holder': 3, 'basis': 3, 'position': 3, 'period': 3, 'postpartum': 3, 'management': 3, 'issuer': 3, 'credit': 3, 'i': 3, 'dollar': 3, 'facility': 3, 'interest': 3, 'floor': 3, 'step': 3, 'portion': 3, 'portfolio': 3, 'physician': 3, 'loss': 3, 'exclusivity': 3, 'volume': 3, 'fertility': 3, 'addition': 3, 'profit': 3, 'development': 3, 'disease': 3, 'party': 3, 'margin': 3, 'net': 3, 'taxis': 3, 'liability': 3, 'separation': 3, 'need': 3}","{'first': 75, 'low': 19, 'financial': 17, 'due': 13, 'generic': 13, 'certain': 10, 'other': 10, 'senior': 7, 'global': 7, 'high': 7, 'subject': 6, 'net': 6, 'metastatic': 6, '-': 6, 'standalone': 5, 'combined': 5, 'principal': 5, 'worldwide': 5, 'non': 5, 'pursuant': 4, 'aggregate': 4, 'attributable': 4, 'related': 4, 'such': 3, 'historical': 3, 'second': 3, 'respiratory': 3, 'cardiovascular': 3, 'vaginal': 3, 'most': 3, 'inflammatory': 3, 'gross': 3, 'foreign': 3, 'discontinued': 3, 'secured': 3, 'critical': 3, 'significant': 3}","{'primarily': 22, 'partially': 8, 'well': 6, 'as': 5, 'approximately': 5, 'additionally': 4, 'particularly': 4, 'respectively': 4, 'also': 3, 'largely': 3, 'outside': 3}","{'of': 254, 'in': 177, 'to': 105, 'for': 54, 'by': 35, 'with': 31, 'on': 21, 'from': 18, 'due': 18, 'under': 14, 'during': 14, 'at': 12, 'outside': 7, 'into': 5, 'after': 3, 'within': 3}",{},"{'and': 128, 'or': 8}","{'as': 39, 'upon': 4, 'that': 3}","[4, 58, 48, 43, 27, 45, 45, 4, 27, 27, 28, 43, 22, 10, 1, 81, 71, 16, 43, 104, 86, 70, 66, 12, 30, 1, 1, 23, 32, 26, 28, 28, 50, 53, 28, 2, 28, 46, 31, 48, 28, 63, 16, 2, 65, 36, 15, 33, 38, 22, 28, 15, 18, 23, 40, 23, 39, 1, 22, 10, 13, 22, 24, 20, 30, 31, 14, 20, 30, 23, 62, 44, 28, 24, 22, 23, 30, 16, 11, 17, 46, 56, 20, 22, 17, 24, 12, 65, 46, 13, 79, 40, 1, 47, 27, 34, 27, 33, 26, 20, ...]","[The condensed combined financial statements reflect the Company’s financial position, results of operations and cash flows as it was operated as part of Merck prior to the Separation, in conformity with U.S. generally accepted accounting principles (""GAAP"")., Although COVID-19-related disruptions, including patients’ inability to access health care providers, prioritization of COVID-19 patients, as well as social distancing measures have negatively affected our results. , Revenues for the first quarter of 2021 primarily reflect our share of the profits., Revenues for the first quarter of 2020 reflect supply sales of the generic product to the manufacturer., The growth was driven primarily by increased demand in the United States and China coupled with favorable pricing in the United States., Sales growth in the first quarter of 2021 was driven primarily by continued increased demand in the United States since launch in 2017., Sales reflect uptake in the United States during the first quarter of 2021, partially offset by a decrease in the EU as compared to the first quarter of 2020., The decline was primarily driven by lower sales of Ezetrol in Japan and Ezetrol and Inegy in the EU., The sales decline largely reflects impact of VBP in China and lower demand in Europe, Japan and Canada attributable in part to the COVID-19 pandemic., The sales decline was driven primarily by lower demand impacted by the COVID-19 pandemic across several markets including Europe, Canada, Russia and Latin America., The sales decline was driven largely by the COVID-19 pandemic coupled with unfavorable discount rates in the United States in the first quarter of 2021., The increase in cost of sales primarily reflects costs absorbed by the Organon entities for related party tolling arrangements with Merck which were not in place during the first quarter of 2020., Additionally, the increase in cost of sales reflects increases in direct corporate Organon costs partially offset by decreases in divisional costs across markets driven by lower sales and lower allocated costs., The gross margin decline reflects an increase in stand up costs, including certain costs related to related party tolling arrangements with Merck, which have lower gross margin percentages compared to product sales., Restructuring Costs Certain of our operations have been affected by restructuring plans initiated by Merck., Other (income) expense, reflects foreign exchange (gains) and losses and other Merck allocated expenses., The change in other expense during the first quarter of 2021 reflects foreign exchange gains and a decrease in employee related allocated expenses., The effective income tax rates of 15.5% and 13.0% for the first quarter of 2021 and 2020, respectively, reflect the beneficial impact of foreign earnings., As a result of the examination conclusion in the first quarter of 2021, we reflected an allocation from Merck of $18 million in the combined financial statements representing our portion of the payment made to the IRS., This net benefit reflects reductions in reserves for unrecognized tax benefits and other related liabilities for tax positions relating to the years that were under examination., Accordingly, the historical results of operations of the Merck Retained Products have been reflected as discontinued operations in the condensed combined financial statements for all periods presented., The decrease in net income reflects lower sales in part due to the COVID 19 pandemic in addition to an increase in costs and expenses incurred to establish Organon as a standalone entity., Cash provided by operating activities was favorably impacted by an increase in net trade payables with Merck, partially offset by a decline in net income resulting from lower sales and an increase in costs related to the stand-up of Organon., The change in cash used in financing activities reflects transactions with Merck (see Note 15 to our combined financial statements).]","[Operating expenses in the first quarter of 2021 were lower due to the COVID-19 pandemic, primarily driven by lower promotional and selling costs as discussed below., The decline is primarily due to ongoing generic competition for products within the established brands business, particularly for respiratory products Singulair, Dulera and Nasonex, and cardiovascular products Zetia and Vytorin, as well as generic competition for women’s health product NuvaRing., The sales decline was partially offset by revenue resulting from an arrangement for the sale of generic etonogestrel/ethinyl estradiol vaginal ring, higher sales of fertility products Follistim AQ and Orgalutran due to higher demand, and higher sales of biosimilars resulting from the continued uptake of Renflexis mainly in the United States and Aybintio in certain markets in Europe., Health -27- Contraception Worldwide sales of Nexplanon/Implanon NXT, a single-rod subdermal contraceptive implant, declined 6% in the first quarter of 2021, primarily driven by lower demand in the United States and the U.K. attributable to the COVID-19 pandemic and Latin America attributable to tender delays that is expected to recover in the second half of 2021., Worldwide sales of NuvaRing, a vaginal contraceptive product, declined 28% in the first quarter of 2021 due to generic competition in most markets, particularly in the EU and the United States., Worldwide sales of Orgalutran, a fertility treatment, increased 85% in the first quarter of 2021 compared to the first quarter of 2020 primarily due to increased demand and favorable discount rates in the United States., Sales in the first quarter of 2021 decreased 42% compared to the first quarter of 2020 primarily due to shipments in Brazil in 2020 related to government orders., We recorded sales of $8 -28- million during the first quarter of 2021 with no comparable sales during the first quarter of 2020 due to the approval of Aybintio in the EU in August 2020 and the launch in September 2020., The decline was primarily driven by lower sales of Ezetrol in Japan and Ezetrol and Inegy in the EU., Sales of Atozet (marketed outside of the United States), a medicine for lowering LDL cholesterol, declined 9% in the first quarter of 2021 compared to the first quarter of 2020 due to lower demand in the EU, primarily in Germany, coupled with unfavorable pricing, partially offset by higher demand in France and the Asia Pacific region., Sales of Rosuzet (marketed outside of the United States), a medicine for lowering LDL cholesterol, declined 52% in the first quarter of 2021 compared to the first quarter of 2020 due to the expiration of a distribution agreement in Korea., Combined global sales of Cozaar, and its companion agent Hyzaar (a combination of Cozaar and hydrochlorothiazide that is marketed in Japan as Preminent), a medicine for the treatment of hypertension, declined 12% in the first quarter of 2021 compared to the first quarter of 2020 primarily due to lower demand in the United States and Canada due to generic competition, a slight decrease due to the impact of the COVID-19 pandemic., Worldwide sales of Zocor, a statin for modifying cholesterol, declined 38% in the first quarter of 2021 compared to the first quarter of 2020 primarily due to lower volumes in China due to the VBP impact., Non-Opioid Pain, Bone and Dermatology Sales of Arcoxia , for the treatment of arthritis and pain, declined 20% in the first quarter of 2021 compared with the first quarter of 2020 primarily due to the impact of the COVID-19 pandemic in the Asia Pacific region, and lower demand in certain markets in the Middle East and Russia., Other Worldwide sales of Proscar, for the treatment of symptomatic benign prostate enlargement, declined 25% in the first quarter of 2021 compared with the first quarter of 2020 primarily due to lower volumes reflecting the impact of VBP in China., Additionally, the increase in cost of sales reflects increases in direct corporate Organon costs partially offset by decreases in divisional costs across markets driven by lower sales and lower allocated costs., Selling, General and Administrative Selling, general and administrative (""SG&A"") expenses increased 21% in the first quarter of 2021 due to costs incurred to establish Organon as a standalone entity and higher employee related costs, partially offset by lower selling and promotional costs, as well as lower travel and meeting expenses, due in part to the impact of the COVID-19 pandemic., The decline in restructuring costs is due to lower allocated costs from Merck during the first quarter of 2021 as compared to the first quarter of 2020 (see Note 4 to our combined financial statements)., The increase in the effective income tax rate in the first quarter of 2021 as compared to the first quarter of 2020 is primarily due to a change in our global mix of income which was partially offset by the income tax benefit recognized in connection with the conclusion of the Internal Revenue Service (IRS) examination of Merck’s 2015-2016 U.S. federal income tax returns., The decrease in net income reflects lower sales in part due to the COVID 19 pandemic in addition to an increase in costs and expenses incurred to establish Organon as a standalone entity., The overall decrease in working capital of continuing operations was primarily due to an increase in related party current liabilities and an increase in accrued employee benefits and payroll., A discussion of accounting estimates considered critical because of the potential for a significant impact on the financial statements due to the inherent uncertainty in such estimates are disclosed in the Critical Accounting Estimates section of Management’s Discussion and Analysis of Financial Condition and Results of Operations included in the Company's Form 10.]"
2,NI,10-K,0001174947-21-000255,1,257209,0,,{},{},{},{},{},{},{},{},{},[],[],[]
3,NI,10-K,0001111711-21-000010,1,191157,259177,"{'VERB': [('relate', 44), ('continue', 43), ('offset', 38), ('include', 34), ('operate', 24), ('increase', 23), ('refer', 21), ('compare', 18), ('reflect', 16), ('report', 16), ('decrease', 16), ('expect', 15), ('provide', 15), ('require', 15), ('end', 15), ('result', 15), ('use', 15), ('drive', 15), ('make', 14), ('record', 14), ('see', 13), ('base', 11), ('bill', 10), ('follow', 9), ('purchase', 9), ('determine', 8), ('remain', 8), ('defer', 8), ('place', 7), ('experience', 7), ('account', 7), ('associate', 7), ('calculate', 7), ('retire', 7), ('utilize', 6), ('project', 6), ('allow', 6), ('impact', 6), ('revolve', 6), ('manage', 6), ('pass', 6), ('recover', 6), ('execute', 6), ('estimate', 6), ('define', 5), ('monitor', 5), ('begin', 5), ('regard', 5), ('approve', 5), ('invest', 5), ('present', 5), ('track', 5), ('discuss', 5), ('sell', 5), ('evaluate', 5), ('meet', 5), ('apply', 5), ('design', 4), ('deliver', 4), ('pay', 4), ('take', 4), ('receive', 4), ('help', 4), ('believe', 4), ('reduce', 4), ('incur', 4), ('complete', 4), ('recognize', 4), ('establish', 4), ('comprise', 4), ('hold', 4), ('change', 4), ('depend', 4), ('cool', 4), ('construct', 4), ('issue', 4), ('show', 4), ('expose', 4), ('regulate', 4), ('implement', 3), ('work', 3), ('enter', 3), ('plan', 3), ('enact', 3), ('support', 3), ('anticipate', 3), ('spend', 3), ('•', 3), ('occur', 3), ('attribute', 3), ('match', 3), ('indicate', 3), ('represent', 3), ('fire', 3), ('fund', 3), ('describe', 3), ('exceed', 3), ('contain', 3), ('identify', 3), ('involve', 3), ...], 'PNOUN': [], 'NOUN': [('rate', 69), ('cost', 66), ('revenue', 48), ('customer', 47), ('expense', 46), ('note', 45), ('operating', 40), ('term', 38), ('income', 33), ('credit', 33), ('risk', 32), ('capital', 32), ('weather', 32), ('change', 31), ('discussion', 30), ('information', 29), ('service', 29), ('asset', 29), ('program', 28), ('gas', 28), ('%', 26), ('year', 26), ('period', 26), ('tax', 26), ('table', 26), ('sale', 25), ('impact', 24), ('interest', 24), ('analysis', 23), ('operation', 22), ('benefit', 22), ('plan', 22), ('CONDITION', 21), ('results', 21), ('pandemic', 21), ('energy', 21), ('facility', 20), ('obligation', 20), ('amount', 19), ('debt', 19), ('payment', 18), ('result', 18), ('recovery', 18), ('decrease', 18), ('depreciation', 18), ('state', 17), ('cash', 17), ('market', 16), ('equity', 16), ('effect', 15), ('generation', 14), ('liability', 14), ('activity', 14), ('expenditure', 14), ('value', 14), ('rating', 14), ('pension', 14), ('business', 13), ('subsidiary', 13), ('employee', 12), ('investment', 12), ('property', 12), ('tracker', 12), ('increase', 12), ('agreement', 12), ('unit', 12), ('party', 11), ('degree', 11), ('assumption', 11), ('safety', 10), ('segment', 10), ('loss', 10), ('goodwill', 10), ('balance', 10), ('heating', 10), ('day', 10), ('growth', 10), ('management', 9), ('company', 9), ('replacement', 9), ('price', 9), ('partner', 9), ('commodity', 9), ('return', 9), ('maintenance', 8), ('time', 8), ('supply', 8), ('material', 8), ('insurance', 8), ('basis', 8), ('amortization', 8), ('impairment', 8), ('usage', 8), ('variance', 8), ('level', 8), ('lease', 8), ('discount', 8), ('process', 7), ('member', 7), ('policy', 7), ...], 'ADJ': [('other', 54), ('regulatory', 41), ('low', 27), ('high', 23), ('additional', 22), ('certain', 21), ('long', 19), ('net', 18), ('normal', 13), ('commercial', 12), ('short', 11), ('significant', 11), ('third', 11), ('warm', 11), ('available', 10), ('such', 10), ('same', 10), ('total', 9), ('common', 9), ('subject', 9), ('-', 8), ('current', 8), ('specific', 8), ('financial', 8), ('industrial', 8), ('future', 8), ('receivable', 7), ('residential', 7), ('further', 7), ('contractual', 7), ('operational', 6), ('ongoing', 6), ('due', 6), ('renewable', 6), ('primary', 6), ('more', 6), ('annual', 6), ('economic', 6), ('actuarial', 6), ('attributable', 5), ('new', 5), ('outside', 5), ('administrative', 5), ('outstanding', 5), ('various', 5), ('postretirement', 5), ('fair', 5), ('corporate', 4), ('non', 4), ('federal', 4), ('equal', 4), ('comparable', 4), ('uncollectible', 4), ('general', 4), ('composite', 4), ('less', 4), ('recent', 4), ('incremental', 4), ('several', 4), ('adequate', 4), ('inclusive', 4), ('actual', 4), ('regulated', 4), ('appropriate', 4), ('periodic', 4), ('initial', 3), ('many', 3), ('second', 3), ('unable', 3), ('pre', 3), ('tax', 3), ('jurisdictional', 3), ('cash', 3), ('intangible', 3), ('natural', 3), ('different', 3), ('first', 3), ('full', 3), ('related', 3), ('preferred', 3), ('joint', 3), ('derivative', 3)], 'ADV': [('primarily', 40), ('approximately', 14), ('partially', 11), ('above', 11), ('also', 10), ('when', 10), ('by', 9), ('more', 6), ('below', 6), ('directly', 5), ('where', 4), ('additionally', 4), ('closely', 4), ('about', 4), ('respectively', 4), ('further', 3), ('currently', 3), ('as', 3), ('well', 3), ('principally', 3), ('therefore', 3), ('prudently', 3), ('most', 3), ('significantly', 3), ('recently', 3)], 'ADP': [('of', 497), ('in', 259), ('to', 228), ('for', 138), ('on', 84), ('by', 64), ('from', 56), ('with', 32), ('at', 26), ('due', 23), ('through', 21), ('during', 14), ('under', 13), ('into', 9), ('over', 6), ('per', 5), ('beyond', 5), ('along', 4), ('below', 4), ('within', 4), ('vs.', 4), ('across', 3), ('until', 3)], 'CONJ': [], 'CCONJ': [('and', 392), ('or', 47), ('n', 21), ('but', 4), ('both', 4)], 'SCONJ': [('as', 55), ('than', 16), ('that', 12), ('if', 9), ('while', 7), ('since', 4), ('upon', 4)]}","{'relate': 44, 'continue': 43, 'offset': 38, 'include': 34, 'operate': 24, 'increase': 23, 'refer': 21, 'compare': 18, 'reflect': 16, 'report': 16, 'decrease': 16, 'expect': 15, 'provide': 15, 'require': 15, 'end': 15, 'result': 15, 'use': 15, 'drive': 15, 'make': 14, 'record': 14, 'see': 13, 'base': 11, 'bill': 10, 'follow': 9, 'purchase': 9, 'determine': 8, 'remain': 8, 'defer': 8, 'place': 7, 'experience': 7, 'account': 7, 'associate': 7, 'calculate': 7, 'retire': 7, 'utilize': 6, 'project': 6, 'allow': 6, 'impact': 6, 'revolve': 6, 'manage': 6, 'pass': 6, 'recover': 6, 'execute': 6, 'estimate': 6, 'define': 5, 'monitor': 5, 'begin': 5, 'regard': 5, 'approve': 5, 'invest': 5, 'present': 5, 'track': 5, 'discuss': 5, 'sell': 5, 'evaluate': 5, 'meet': 5, 'apply': 5, 'design': 4, 'deliver': 4, 'pay': 4, 'take': 4, 'receive': 4, 'help': 4, 'believe': 4, 'reduce': 4, 'incur': 4, 'complete': 4, 'recognize': 4, 'establish': 4, 'comprise': 4, 'hold': 4, 'change': 4, 'depend': 4, 'cool': 4, 'construct': 4, 'issue': 4, 'show': 4, 'expose': 4, 'regulate': 4, 'implement': 3, 'work': 3, 'enter': 3, 'plan': 3, 'enact': 3, 'support': 3, 'anticipate': 3, 'spend': 3, '•': 3, 'occur': 3, 'attribute': 3, 'match': 3, 'indicate': 3, 'represent': 3, 'fire': 3, 'fund': 3, 'describe': 3, 'exceed': 3, 'contain': 3, 'identify': 3, 'involve': 3, ...}",{},"{'rate': 69, 'cost': 66, 'revenue': 48, 'customer': 47, 'expense': 46, 'note': 45, 'operating': 40, 'term': 38, 'income': 33, 'credit': 33, 'risk': 32, 'capital': 32, 'weather': 32, 'change': 31, 'discussion': 30, 'information': 29, 'service': 29, 'asset': 29, 'program': 28, 'gas': 28, '%': 26, 'year': 26, 'period': 26, 'tax': 26, 'table': 26, 'sale': 25, 'impact': 24, 'interest': 24, 'analysis': 23, 'operation': 22, 'benefit': 22, 'plan': 22, 'CONDITION': 21, 'results': 21, 'pandemic': 21, 'energy': 21, 'facility': 20, 'obligation': 20, 'amount': 19, 'debt': 19, 'payment': 18, 'result': 18, 'recovery': 18, 'decrease': 18, 'depreciation': 18, 'state': 17, 'cash': 17, 'market': 16, 'equity': 16, 'effect': 15, 'generation': 14, 'liability': 14, 'activity': 14, 'expenditure': 14, 'value': 14, 'rating': 14, 'pension': 14, 'business': 13, 'subsidiary': 13, 'employee': 12, 'investment': 12, 'property': 12, 'tracker': 12, 'increase': 12, 'agreement': 12, 'unit': 12, 'party': 11, 'degree': 11, 'assumption': 11, 'safety': 10, 'segment': 10, 'loss': 10, 'goodwill': 10, 'balance': 10, 'heating': 10, 'day': 10, 'growth': 10, 'management': 9, 'company': 9, 'replacement': 9, 'price': 9, 'partner': 9, 'commodity': 9, 'return': 9, 'maintenance': 8, 'time': 8, 'supply': 8, 'material': 8, 'insurance': 8, 'basis': 8, 'amortization': 8, 'impairment': 8, 'usage': 8, 'variance': 8, 'level': 8, 'lease': 8, 'discount': 8, 'process': 7, 'member': 7, 'policy': 7, ...}","{'other': 54, 'regulatory': 41, 'low': 27, 'high': 23, 'additional': 22, 'certain': 21, 'long': 19, 'net': 18, 'normal': 13, 'commercial': 12, 'short': 11, 'significant': 11, 'third': 11, 'warm': 11, 'available': 10, 'such': 10, 'same': 10, 'total': 9, 'common': 9, 'subject': 9, '-': 8, 'current': 8, 'specific': 8, 'financial': 8, 'industrial': 8, 'future': 8, 'receivable': 7, 'residential': 7, 'further': 7, 'contractual': 7, 'operational': 6, 'ongoing': 6, 'due': 6, 'renewable': 6, 'primary': 6, 'more': 6, 'annual': 6, 'economic': 6, 'actuarial': 6, 'attributable': 5, 'new': 5, 'outside': 5, 'administrative': 5, 'outstanding': 5, 'various': 5, 'postretirement': 5, 'fair': 5, 'corporate': 4, 'non': 4, 'federal': 4, 'equal': 4, 'comparable': 4, 'uncollectible': 4, 'general': 4, 'composite': 4, 'less': 4, 'recent': 4, 'incremental': 4, 'several': 4, 'adequate': 4, 'inclusive': 4, 'actual': 4, 'regulated': 4, 'appropriate': 4, 'periodic': 4, 'initial': 3, 'many': 3, 'second': 3, 'unable': 3, 'pre': 3, 'tax': 3, 'jurisdictional': 3, 'cash': 3, 'intangible': 3, 'natural': 3, 'different': 3, 'first': 3, 'full': 3, 'related': 3, 'preferred': 3, 'joint': 3, 'derivative': 3}","{'primarily': 40, 'approximately': 14, 'partially': 11, 'above': 11, 'also': 10, 'when': 10, 'by': 9, 'more': 6, 'below': 6, 'directly': 5, 'where': 4, 'additionally': 4, 'closely': 4, 'about': 4, 'respectively': 4, 'further': 3, 'currently': 3, 'as': 3, 'well': 3, 'principally': 3, 'therefore': 3, 'prudently': 3, 'most': 3, 'significantly': 3, 'recently': 3}","{'of': 497, 'in': 259, 'to': 228, 'for': 138, 'on': 84, 'by': 64, 'from': 56, 'with': 32, 'at': 26, 'due': 23, 'through': 21, 'during': 14, 'under': 13, 'into': 9, 'over': 6, 'per': 5, 'beyond': 5, 'along': 4, 'below': 4, 'within': 4, 'vs.': 4, 'across': 3, 'until': 3}",{},"{'and': 392, 'or': 47, 'n': 21, 'but': 4, 'both': 4}","{'as': 55, 'than': 16, 'that': 12, 'if': 9, 'while': 7, 'since': 4, 'upon': 4}","[4, 16, 2, 5, 1, 3, 26, 33, 22, 30, 28, 26, 36, 22, 3, 26, 33, 25, 23, 44, 13, 28, 19, 18, 22, 30, 18, 9, 22, 25, 22, 16, 8, 31, 29, 24, 45, 32, 41, 21, 102, 20, 19, 38, 23, 8, 16, 2, 5, 5, 33, 28, 20, 12, 21, 20, 19, 28, 4, 63, 65, 58, 32, 62, 47, 29, 16, 8, 16, 2, 2, 3, 2, 71, 79, 32, 9, 17, 60, 8, 16, 2, 5, 40, 16, 2, 5, 3, 31, 24, 38, 24, 5, 26, 22, 31, 1, 15, 1, 15, ...]","[Additionally, the decrease to net income available to common shareholders was also impacted by the loss on early extinguishment of debt in 2020 as well as partially offset by a change from income tax expense in 2019 to an income tax benefit in 2020. , In addition, comparability of operation and maintenance expenses, depreciation and amortization, and other taxes may be impacted by regulatory, depreciation and tax trackers that allow for the recovery in rates of certain costs., The change in operating revenues was primarily driven by: • Lower cost of energy billed to customers, which is offset in operating expense, of $273.4 million., This change was primarily driven by: • Loss on sale of the Massachusetts Business of $412.4 million., The change in operating revenues was primarily driven by: •, This change was primarily driven by: • Decreased expenses related to third-party claims and other costs for the Greater Lawrence Incident of $1,090.7 million, net of insurance recoveries recorded., Impacts of the change in methodology will be reflected prospectively and disclosed to the extent it results in notable year-over-year variances in operating revenues., The majority of these amounts were driven by NIPSCO and Columbia of Pennsylvania. , In addition, comparability of operation and maintenance expenses and depreciation and amortization may be impacted by regulatory and depreciation trackers that allow for the recovery in rates of certain costs., The change in operating revenues was primarily driven by: • Lower cost of energy billed to customers, which is offset in operating expense, of $152.1 million., This change was primarily driven by: • Lower cost of energy billed to customers, which is offset in operating revenue, of $152.1 million., The change in operating revenues was primarily driven by: • Lower cost of energy billed to customers, which is offset in operating expense, of $34.8 million., This change was primarily driven by: • Lower cost of energy billed to customers, which is offset in operating revenue, of $34.8 million., Impacts of the change in methodology will be reflected prospectively and disclosed to the extent it results in notable year-over-year variances in operating revenues., This decrease was primarily driven by a year over year increase in net payments related to the Greater Lawrence Incident., The table below reflects capital expenditures and certain other investing activities by segment for 2020, 2019 and 2018. , The 2018 capital expenditures for Gas Distribution Operations reflects reclassifying the Greater Lawrence Incident pipeline replacement from system growth and tracker to maintenance.]","[The decrease in both net income available to common shareholders and operating income during 2020 was primarily due to lower operating revenue related to the sale of the Massachusetts Business, as well as higher operating expenses due to insurance recoveries recorded in 2019, net of third-party claims and other costs, related to the Greater Lawrence Incident., This change is primarily due to the loss on early extinguishment of debt in 2020., These items are offset by increased deferred tax expense recognized on the sale of the Columbia of Massachusetts' regulatory liability, established due to TCJA in 2017, that would have otherwise been recognized over the amortization period, non-cash impairment of goodwill related to Columbia of Massachusetts in 2019 (see Note 7, ""Goodwill and Other Intangible Assets"" for additional information) and one-time adjustments to deferred tax balances., The change in operating revenues was primarily driven by: • Lower cost of energy billed to customers, which is offset in operating expense, of $273.4 million., Lower revenues due to the sale of the Massachusetts Business of $102.2 million., This change was primarily driven by: • Loss on sale of the Massachusetts Business of $412.4 million., Increased expenses primarily due to the impact of the pandemic related to materials and supplies, outside services, and uncollectible expenses of $23.8 million, offset by $12.0 million of deferral of uncollectible and other expenses, net of benefits, related to the pandemic., primarily due to higher capital expenditures placed in service of $24.3 million, Lower operation and maintenance and depreciation and amortization expenses due to the Massachusetts Business sale of $98.7 million., The change in operating revenues was primarily driven by: •, This change was primarily driven by: • Decreased expenses related to third-party claims and other costs for the Greater Lawrence Incident of $1,090.7 million, net of insurance recoveries recorded., Increased depreciation of $103.8 million due to the regulatory outcome of NIPSCO's gas rate case, an increase in amortization of depreciation previously deferred as a regulator asset resulting from Columbia of Ohio's CEP, and higher capital expenditures placed in service., Higher employee and administrative expenses of $50.2 million driven by resources shifting from the temporary assistance on the Greater Lawrence Incident restoration to normal operations (offset in the decreased Greater Lawrence Incident costs discussed above) and an increase in headcount. , Higher property taxes of $22.2 million primarily due to increased amortization of property taxes previously deferred as a regulatory asset resulting from Columbia of Ohio's CEP, as well as higher capital expenditures placed in service., Higher outside services of $17.4 million primarily due to increased line location and safety-related work., In general, we calculate the weather-related revenue variance based on changing customer demand driven by weather variance from normal heating degree days, net of weather normalization mechanisms., The majority of these amounts were driven by NIPSCO and Columbia of Pennsylvania. , Weather in the Gas Distribution Operations service territories for 2019 was about 1% warmer than normal and about 3% warmer than 2018; however, due to the aforementioned change in methodology, the change in operating revenues attributed to weather resulted in an increase of $7.4 million for the year ended December 31, 2019 compared to 2018., This decrease is primarily attributable to warmer weather experienced in 2020 compared to 2019, the sale of the Massachusetts Business and decreased usage by commercial and industrial customers primarily due to the pandemic., The change in operating revenues was primarily driven by: • Lower cost of energy billed to customers, which is offset in operating expense, of $152.1 million., This change was primarily driven by: • Lower cost of energy billed to customers, which is offset in operating revenue, of $152.1 million., Partially offset by: • Increased depreciation of $61.6 million primarily due to additional plant placed in service., Increased expenses primarily due to the impact of pandemic-related materials and supplies, outside services, uncollectible and sequestration expenses of $10.7 million, offset by a $5.3 million deferral of uncollectible and other expenses, related to the pandemic., Increased materials and supplies costs of $4.7 million • Higher insurance expense of $2.7 million primarily driven by increased premiums., The change in operating revenues was primarily driven by: • Lower cost of energy billed to customers, which is offset in operating expense, of $34.8 million., This change was primarily driven by: • Lower cost of energy billed to customers, which is offset in operating revenue, of $34.8 million., Increased depreciation of $8.7 million due to higher capital expenditures placed in service., In general, we calculate the weather-related revenue variance based on changing customer demand driven by weather variance from normal heating or cooling degree days., This decrease was primarily attributable to decreased usage by industrial and commercial customers due to the pandemic and higher self-generation by industrial customers, partially offset by increased usage by residential customers primarily due to the pandemic., As discussed in the ""Executive Summary"", Part I, Item 1A “Risk Factors,” and in Note 20, “Other Commitments and Contingencies” in the Notes to Consolidated Financial Statements, due to the inherent uncertainty of litigation, there can be no assurance that the outcome or resolution of any particular claim related to the Greater Lawrence Incident will not continue to have an adverse impact on our cash flows., This decrease was primarily driven by a year over year increase in net payments related to the Greater Lawrence Incident., In 2020, our typical investing cash outflows were offset by $1,115.9 million of proceeds from the sale of assets, driven by the sale of the Massachusetts Business., Flows primarily due to the capitalized portion of the Corporate Incentive Plan payout, inclusion of capital expenditures included in current liabilities and AFUDC Equity. , This decrease in spending is primarily due to the sale of the Massachusetts Business and impact of COVID 19., This increased spending is primarily due to growth, safety and system modernization projects., Credit risk arises due to the possibility that a customer, supplier or counterparty will not be able or willing to fulfill its obligations on a transaction on or before the settlement date.]"
4,NI,10-K,0001174947-20-000354,0,155108,0,,{},{},{},{},{},{},{},{},{},[],[],[]


In [60]:
def generate_vis(df, pos, top_n):

    wordcloud = WordCloud(width = 800, height = 800,
                    background_color ='white',
                    min_font_size = 10).generate_from_frequencies(df[pos].sum())

    y, x = [], []
    for word,count in df[pos].sum().most_common(top_n):
        y.append(word)
        x.append(count)

    fig, (ax1, ax2) = plt.subplots(1, 2, sharey=False,figsize = (20,10))
    ax1.imshow(wordcloud)
    ax1.set_title(pos+' Cloud')
    ax2 = sns.barplot(x,y)
    ax2.set_title('Top {} {}s'.format(top_n,pos))
    plt.show()


In [254]:
causal_verbs = ['drive','reflect','affect','impact','cause']
causal_phrases = ['due to', 'driven by']

# expectation_verbs = ['expect','believe','forecast']

In [142]:
print('OVERVIEW of Sample Statistics')
print("MDA sample size = ",len(df))
print("Num of unique companies = ",len(df.ticker.unique()))
print("Num of 10K = {} ({:.1f}% of total)".format(sum(df['type']=='10-K'),sum(df['type']=='10-K')/len(df)*100))
print("Num of 10Q = {} ({:.1f}% of total)".format(sum(df['type']=='10-Q'),sum(df['type']=='10-Q')/len(df)*100))
print("Num of MD&A identified ={} ({:.1f}% of total)".format(sum(df['sent_len'].apply(len)!=0),sum(df['sent_len'].apply(len)!=0)/len(df)*100))
print("Num of sentences identified:")
print(" - Containing causal verbs {} = {}".format(causal_verbs,sum(df['causal_verbs'].apply(len))))
print(" - Containing causal phrases {} = {}".format(causal_phrases,sum(df['causal_phrases'].apply(len))))


OVERVIEW of Sample Statistics
MDA sample size =  738
Num of unique companies =  10
Num of 10K = 174 (23.6% of total)
Num of 10Q = 564 (76.4% of total)
Num of MD&A identified =676 (91.6% of total)
Num of sentences identified:
 - Containing causal verbs ['drive', 'reflect', 'affect', 'impact', 'cause'] = 10601
 - Containing causal phrases ['due to', 'driven by'] = 19779


**Obervations and Next Steps:**
- Probably need to recall 2 consecutive sentences in order to perform Co-reference Resolution
- Sentence segmentation is currently done via SpaCy's default doc.sents - probably need to modify/adapt 
- Need to consider how to approach bullet points
- Need to enrich the causal_verbs and causal_phrases
- Need to think about how to extract key phrases {cause} and {effect} from identified sentences
- Need to filter off the overlapping sentences i.e. containing both causal_verbs and causal phrases
    

In [54]:
# Extracting Noun Phrases --> issue: noun phrases do not always capture the full range of cause / effect spans

sentences = df.iloc[0]['causal_verbs']
for s in sentences[:2]:
    print('\n\n >>> ',s,'\n')
    doc = nlp(s)
    for sent in doc.sents:
        print('Root Verb: ',sent.root.text,sent.root.pos_,sent.root.dep_)
        for chunk in sent.noun_chunks:
            if (chunk.root.dep_ in ['nsubj','nsubjpass','dobj'] and chunk.root.head == sent.root):
                print(chunk.root.dep_,'->',chunk.text, '\n')
            else:
                print('......',chunk.text, '->',chunk.root.pos_,chunk.root.dep_,'->',chunk.root.head.text,'->',chunk.root.head.pos_,chunk.root.head.dep_,'\n')
        break



 >>>  COVID-19-related disruptions, including patients’ inability to access health care providers, prioritization of COVID-19 patients, as well as social distancing measures have negatively affected our results.   

Root Verb:  affected VERB ROOT
nsubj -> COVID-19-related disruptions 

...... patients’ inability -> NOUN pobj -> including -> VERB prep 

...... health care providers -> NOUN dobj -> access -> VERB acl 

...... prioritization -> NOUN appos -> disruptions -> NOUN nsubj 

...... COVID-19 patients -> NOUN pobj -> of -> ADP prep 

...... social distancing measures -> NOUN conj -> prioritization -> NOUN appos 

dobj -> our results 



 >>>  The decline during the first six months of 2021 primarily reflects decreases across markets due to ongoing generic competition for products within the established brands business, particularly for cardiovascular products Zetia and Vytorin (ezetimibe and simvastatin), lower sales of respiratory products Singulair (montelukast), Dulera and N

In [91]:
# Need to capture the full phrase, not just the noun_chunk identified by spacy

print("\n>>> {}\n...VB root = '{}' \n...left = '{}' \n...right = '{}'".format(sent,sent.root,sent[:sent.root.i],sent[sent.root.i+1:]))

j = sent.root.i+1
for punk in sent[sent.root.i+1:]:
    if punk.is_punct and not (punk.is_left_punct or punk.is_right_punct):
        print("......RIGHT puncts: '{}'".format(sent[j:punk.i]))
        j = punk.i+1

#for np in sent[:sent.root.i].noun_chunks:
#    print("......LEFT NP chunk: '{}' -> head '{}'".format(np.text,np.root.head.text))
    
#for np in sent[sent.root.i+1:].noun_chunks:
#    print("......RIGHT NP chunk: '{}' -> head '{}'".format(np.text,np.root.head.text))

#for token in sent[sent.root.i+1:]:
#    if token.pos_ == 'VERB':
#        print("......RIGHT verbs: '{}' -> head '{}'".format(token.text,token.head))



>>> The decline during the first six months of 2021 primarily reflects decreases across markets due to ongoing generic competition for products within the established brands business, particularly for cardiovascular products Zetia and Vytorin (ezetimibe and simvastatin), lower sales of respiratory products Singulair (montelukast), Dulera and Nasonex, and generic competition for women’s health product
...VB root = 'reflects' 
...left = 'The decline during the first six months of 2021 primarily' 
...right = 'decreases across markets due to ongoing generic competition for products within the established brands business, particularly for cardiovascular products Zetia and Vytorin (ezetimibe and simvastatin), lower sales of respiratory products Singulair (montelukast), Dulera and Nasonex, and generic competition for women’s health product'
......RIGHT puncts: 'decreases across markets due to ongoing generic competition for products within the established brands business'
......RIGHT puncts:

In [195]:

# Make use of punctuations to get sentence segments

def decompose(sentence):
    
    sent = nlp(sentence)
    
    root = sent[:].root
    lefts = []
    rights = []
    
    j = 0
    for punk in sent[:root.i]:
        if punk.is_punct and not (punk.is_left_punct or punk.is_right_punct):
            #print("......LEFT puncts: '{}'".format(sent[j:punk.i]))
            lefts.append(sent[j:punk.i])
            j = punk.i+1
    if len(lefts)==0:
        lefts.append(sent[0:punk.i])
    
    j = root.i+1
    for punk in sent[root.i+1:]:
        if punk.is_punct and not (punk.is_left_punct or punk.is_right_punct):
            #print("......RIGHT puncts: '{}'".format(sent[j:punk.i]))
            rights.append(sent[j:punk.i])
            j = punk.i+1
    if not (root.right_edge.is_punct or root.right_edge.is_space):
        #print("......RIGHT puncts: '{}'".format(sent[j:]))
        rights.append(sent[j:])
    #print('\n\n\n')
    
    return lefts, root.lemma_, rights

# first attempt at subtree, not ideal

def get_subtrees(sentence):
    sent = nlp(sentence)
    root = sent[:].root
    for token in sent[:root.i]:
        if token.dep_ == 'nsubj':
            voice = 'active'
            left_subtree = [*token.subtree]
        if token.dep_ == 'nsubjpass':
            voice = 'passive'
            left_subtree = [*token.subtree]
    
    left_subtree_ = sent[:root.i]

    right_subtree = sent[root.i+1:]
    return voice, left_subtree,left_subtree_, root, right_subtree


In [247]:
# check if contains topics of interest

topics = ['revenue','revenues', 'sales','cost','costs','margin','profit','net income','our results','our business','growth','decline']

from spacy.matcher import PhraseMatcher

def match(sentence, topics):
    if type(sentence) is spacy.tokens.doc.Doc:
        doc = sentence
    else:
        doc = nlp(sentence)
    patterns = [nlp(text) for text in topics]
    phrase_matcher = PhraseMatcher(nlp.vocab,attr="LOWER")
    phrase_matcher.add('topics', None, *patterns)
    matched_phrases = phrase_matcher(doc)
    if (len(matched_phrases)>0):
        return True

In [197]:

row = df.iloc[0]
i = 1
for sentence in row['causal_verbs']:
    if match(sentence, topics):
        lefts, root, rights = decompose(sentence) 
        print(i,'>>> ROOT={}\n\tLEFT={}\n\tRIGHT={}\n'.format(root, lefts, rights))
        i+=1
        voice, left_tree, left_tree_, root, right_tree = get_subtrees(sentence)
        print('VOICE={}, root={} \n left_subtree={}\n left_subtree_={}\n right_subtree={}\n'.format(voice, root,left_tree,left_tree_,right_tree))
        
    else:
        print(' ...... ',sentence)
print(len(row['causal_verbs']))


1 >>> ROOT=affect
	LEFT=[COVID-19-related disruptions, including patients’ inability to access health care providers, prioritization of COVID-19 patients]
	RIGHT=[our results]

VOICE=active, root=affected 
 left_subtree=[COVID-19-related, disruptions, ,, including, patients, ’, inability, to, access, health, care, providers, ,, prioritization, of, COVID-19, patients, ,, as, well, as, social, distancing, measures]
 left_subtree_=COVID-19-related disruptions, including patients’ inability to access health care providers, prioritization of COVID-19 patients, as well as social distancing measures have negatively
 right_subtree=our results.  

2 >>> ROOT=reflect
	LEFT=[The decline during the first six months of 2021]
	RIGHT=[decreases across markets due to ongoing generic competition for products within the established brands business, particularly for cardiovascular products Zetia and Vytorin (ezetimibe and simvastatin), lower sales of respiratory products Singulair (montelukast), Dulera a

12 >>> ROOT=reflect
	LEFT=[The change in cash used in financing]
	RIGHT=[the proceeds from the issuance of long term debt, the payment of related debt issuance costs and the settlement of  the transactions with Merck in connection with the Separation (see Note 17 to our Condensed Consolidated Financial Statements)]

VOICE=active, root=reflects 
 left_subtree=[The, change, in, cash, used, in, financing, activities]
 left_subtree_=The change in cash used in financing activities
 right_subtree=the proceeds from the issuance of long term debt, the payment of related debt issuance costs and the settlement of  the transactions with Merck in connection with the Separation (see Note 17 to our Condensed Consolidated Financial Statements).

20


In [229]:
#sent = nlp('COVID-19-related disruptions, including patients’ inability to access health care providers, prioritization of COVID-19 patients, as well as social distancing measures have negatively affected our results.')
#sent = nlp('Cash provided by operating activities was favorably impacted by an increase in accounts payable, including amounts due to Merck, partially offset by a decline in net income.')
#sent = nlp('Accordingly, the historical results of operations of the Merck Retained Products have been reflected as discontinued operations in the Condensed Consolidated Financial Statements for all periods presented')
sent = nlp('Sales for the first six months of 2021 also reflect lower demand in Europe and Canada in the beginning of 2021 due to the COVID-19 pandemic.')

from spacy import displacy
#displacy.render(sent, style='dep')

#for t in sent:
#    print(t,'-->',t.dep_,'--head: ',t.head.text)

    
root = sent[:].root

advmod = []
prep=[]
for child in root.children:
    
    # active voice
    if child.dep_ == 'nsubj':
        voice = 'active'
        left_subtree = sent[child.left_edge.i:child.right_edge.i+1] 
    if child.dep_ == 'dobj':
        right_subtree = sent[child.left_edge.i:child.right_edge.i+1]
    
    # passive voice
    if child.dep_ == 'nsubjpass':
        voice = 'passive'
        left_subtree = sent[child.left_edge.i:child.right_edge.i+1]    
    if child.dep_ == 'agent':
        right_subtree = sent[child.left_edge.i+1:child.right_edge.i+1]

    # other modifiers    
    if child.dep_ == 'advmod':
        advmod.append(sent[child.left_edge.i:child.right_edge.i+1])
    if child.dep_ == 'prep':
        prep.append(sent[child.left_edge.i:child.right_edge.i+1])
        
print(' voice={}\t root={}\t advmod={}\n left_subtree={}\n right_subtree={}\n prep={}\n'.format(voice, root, advmod,left_subtree,right_subtree,prep)) 


due due to the COVID-19 pandemic
 voice=active	 root=reflect	 advmod=[also]
 left_subtree=Sales for the first six months of 2021
 right_subtree=lower demand in Europe and Canada in the beginning of 2021
 prep=[due to the COVID-19 pandemic]



In [80]:
def extract_verb(sent):
    """extract verb phrase from a sentence
    input: Doc object from nlp(sentence)
    return: verb.lemma_ , verb phrase in the original sentence (including auxilary and proposition)
    """
    verb = ''
    verb_chunk = []

    root = sent[:].root

    if root.pos_ in ['VERB','AUX']:
        verb = sent[:].root.lemma_

        for token in sent:
            if (token.pos_ not in ['NOUN','ADV','PUNCT']) and (token.head == root or token == root):
                verb_chunk.append(token)
            if (token.pos_ in ['ADP']) and (token.head in root.children) and (token.i > root.i):
                verb_chunk.append(token)
    
    return verb, verb_chunk



def extract_subject(sent):
    subj = ''
    subj_chunk = []
    connectors = []
    
    root = sent[:].root
    
    for child in root.lefts:
        if child.dep_ in ['nsubj','nsubjpass']:
            subj = child
    
            for np in sent[:root.i].noun_chunks:
                subj_chunk.append(np)
                if np.start>1:
                    connectors.append(sent[np.start-1])
            
            break
            
    return subj, subj_chunk, connectors




def extract_object(sent, verb_chunk):

    obj = ''
    obj_chunk=[]
    connectors = []
        
    for child in sent[verb_chunk[-1].i+1:]: #verb_chunk[-1].rights: #due to - child of due not to
        
        if child.dep_ in ['obj','dobj','iobj']: #, 'pobj']:
            obj = child
            break
    
    for np in sent[verb_chunk[-1].i+1:].noun_chunks:
        obj_chunk.append(np)
        if np.start > verb_chunk[-1].i+1:
            connectors.append(sent[np.start-1])
    
    if obj is '' and len(obj_chunk)>0:
        obj = obj_chunk[0].root
        
    return obj, obj_chunk, connectors



def break_down(sentence):
    """breaking a sentence down into the following components: 
    - voice (active or passive), 
    - subject (noun, noun-phrases, connectors between noun-phrases), 
    - predicate (verb, verb-chunks), 
    - object (noun, noun-phrases, connectors between noun-phrases).
    
    input: string or spacy.tokens.doc.Doc
    output: dictionary1 (key: [list of position indices]), dictionary2 (key: [list of raw texts])
    
    Rather than using the above individual functions and looping over a sentence multiple-time, this function only loop once 
    """

    if type(sentence) is spacy.tokens.doc.Doc:
        sent = sentence
    else:
        sent = nlp(sentence)
    
    d_ix = {}
    root = sent[:].root
        
    voice = []
    verb = []
    verb_chunk = []
    subj = []
    subj_chunk = []
    subj_connectors = []
    obj = []
    obj_chunk=[]
    obj_connectors = []
    
    if root.pos_ in ['VERB','AUX']:
        verb.append(sent[:].root.i)
        for token in sent:
            if (token.pos_ not in ['NOUN','ADV','PUNCT']) and (token.head == root or token == root):
                verb_chunk.append(token.i)
            if (token.pos_ in ['ADP']) and (token.head in root.children) and (token.i > root.i):
                verb_chunk.append(token.i)

            if token.dep_ =='nsubj': 
                voice.append('active')
                subj.append(token.i) 
            if token.dep_ =='nsubjpass':
                voice.append('passive')
                subj.append(token.i)
            
            if token.dep_ in ['obj','dobj','iobj']: #, 'pobj']:
                obj.append(token.i)

        for np in sent[:root.i].noun_chunks:
            subj_chunk.append((np.start,np.end))
            if np.start>1:
                subj_connectors.append(np.start-1)

        for np in sent[verb_chunk[-1]+1:].noun_chunks:
            obj_chunk.append((np.start,np.end))
            if np.start > verb_chunk[-1]+1:
                obj_connectors.append(np.start-1)
    
        if obj ==[] and len(obj_chunk)>0:
            np1 = sent[obj_chunk[0][0]]
            obj.append(np1.head.i) 


    d_ix = {'voice':voice, 'verb': verb, 'verb_chunk':verb_chunk, 
         'subj':subj, 'subj_chunk':subj_chunk,'subj_connectors':subj_connectors,
         'obj':obj, 'obj_chunk':obj_chunk,'obj_connectors':obj_connectors}
    
    d_text = {}
    for k,v in d_ix.items():
        d_text[k] = []
        
        if len(v) == 0 or type(v[0]) is str:
            d_text[k] = v
        
        elif type(v[0]) is int:
            for x in v:
                d_text[k].append(sent[x].text)
        
        elif type(v[0]) is tuple:
            for x in v:
                d_text[k].append(sent[x[0]:x[1]].text)
        
    return d_ix, d_text




In [81]:
s1 = 'Total sales decreased 22% primarily attributable to the impact of Covid-19.' # "Singulair sales in the first six months of 2021 decreased 22% primarily attributable to the impact of VBP in China, lower volume in Japan due to generic competition as well as the timing of shipments, and ongoing impact of the COVID-19 pandemic in the Asia Pacific region."
s2 =  'The increase is primarily due to higher sales of health products.' #"The increase is primarily due to higher sales of women’s health products, including Nexplanon, Implanon and NXT."
s3 =  'the overall increase in working capital was driven by cash funding for new factory.' #"The overall increase in working capital of continuing operations was primarily driven by cash funding by Merck in connection with the Separation, offset by an increase in current liabilities with Merck primarily for inventory purchases, as well as increases in employee benefits and payroll."
s4 = 'The typhoon caused many deaths.'
for s in [s1,s2,s3,s4]:
    print('\n\n',s) 
#    for token in nlp(s):
#        print(token,'...',token.pos_,'...',token.dep_,'...',token.head)
    
#    verb, verb_chunk = extract_verb(nlp(s))
#    print(verb_chunk[-1], [*verb_chunk[-1].rights])
#    print('\n>>>', extract_verb(nlp(s)))
#    print('\n>>>', extract_subject(nlp(s)))
#    print('\n>>>', extract_object(nlp(s), verb_chunk))
    
    print('\n\n',break_down(nlp(s)))
        



 Total sales decreased 22% primarily attributable to the impact of Covid-19.


 ({'voice': ['active'], 'verb': [2], 'verb_chunk': [2, 6, 7], 'subj': [1], 'subj_chunk': [(0, 2)], 'subj_connectors': [], 'obj': [9], 'obj_chunk': [(8, 10), (11, 12)], 'obj_connectors': [10]}, {'voice': ['active'], 'verb': ['decreased'], 'verb_chunk': ['decreased', 'attributable', 'to'], 'subj': ['sales'], 'subj_chunk': ['Total sales'], 'subj_connectors': [], 'obj': ['impact'], 'obj_chunk': ['the impact', 'Covid-19'], 'obj_connectors': ['of']})


 The increase is primarily due to higher sales of health products.


 ({'voice': ['active'], 'verb': [2], 'verb_chunk': [2, 4, 5], 'subj': [1], 'subj_chunk': [(0, 2)], 'subj_connectors': [], 'obj': [7], 'obj_chunk': [(6, 8), (9, 11)], 'obj_connectors': [8]}, {'voice': ['active'], 'verb': ['is'], 'verb_chunk': ['is', 'due', 'to'], 'subj': ['increase'], 'subj_chunk': ['The increase'], 'subj_connectors': [], 'obj': ['sales'], 'obj_chunk': ['higher sales', 'health pro

In [59]:
# Need to group verbs according to pattern

def break_down(sentence):
    """breaking a sentence down into components
    input: string
    output: a dict consisting of (voice, root, advmod, subject, object, prep)
    """
    #sent = nlp(sentence)
    if type(sentence) is spacy.tokens.doc.Doc:
        sent = sentence
    else:
        sent = nlp(sentence)
        
    
    if sent[:].root.pos_ not in ['VERB','AUX']:
        return 
    # skip non-sentences 
    else:
        root = sent[:].root
    
    d = {}
    d['advmod'] = []
#    d['prep'] = []
    d['root'] = root
    
    
    d['verb_chunk'] = []
    for token in sent:
        if (token.pos_ not in ['NOUN','PUNCT']) and (token.head == root or token == root):
            d['verb_chunk'].append(token)
    
    
    d['voice'] = ''
    
    for child in root.children:

        # active voice
        if child.dep_ == 'nsubj':
            d['voice'] = 'active'
            d['left'] = sent[:child.right_edge.i+1] 
        if child.dep_ == 'dobj':
            d['right'] = sent[child.left_edge.i:]

        # passive voice
        if child.dep_ == 'nsubjpass':
            d['voice'] = 'passive'
            d['left'] = sent[:child.right_edge.i+1]    
        if child.dep_ == 'agent':
            d['right'] = sent[child.left_edge.i+1:]

        # other modifiers    
        if child.dep_ == 'advmod':
            d['advmod'].append(sent[child.left_edge.i:child.right_edge.i+1])
        if child.dep_ == 'neg' and child.i == root.i - 1:
            d['advmod'].append(sent[child.left_edge.i:child.right_edge.i+1])

    if 'left' not in d.keys():
        d['left'] = sent[:root.i]
    if 'right' not in d.keys():
        d['right'] = sent[root.i+1:]
        
    return d




def get_cause(d, CvE,EvC):
    
    if d['root'].lemma_ in EvC:
        if d['voice'] == 'active':
            return d['right']
        else:
            return d['left']
    if d['root'].lemma_ in CvE:
        if d['voice'] == 'active':
            return d['left']
        else:
            return d['right']

def get_effect(d, CvE,EvC):
    
    if d['root'].lemma_ in CvE:
        if d['voice'] == 'active':
            return d['right']
        else:
            return d['left']
    if d['root'].lemma_ in EvC:
        if d['voice'] == 'active':
            return d['left']
        else:
            return d['right']
        

In [303]:
row = df.iloc[0]

i = 1
for sentence in row['causal_verbs']:
    
    print(' ...... ',sentence,'\n')
    
    d = break_down(sentence)
    if d:
        e = get_effect(d,CvE,EvC)
    
        if match(e.as_doc(), topics):
            c = get_cause(d,CvE,EvC)
            print(i,'>>> VOICE={} ROOT={} advmod={}\n\tCAUSE={}\n\tEFFECT={}\n\n\n'.format(d['voice'],d['root'],d['advmod'], c, e))
            i+=1
    
print(len(row['causal_verbs']), 'sentences containing causal_verbs -> filter ->', i,'sentences containing relevant topics')


 ......  COVID-19-related disruptions, including patients’ inability to access health care providers, prioritization of COVID-19 patients, as well as social distancing measures have negatively affected our results.   

1 >>> VOICE=active ROOT=affected advmod=[negatively]
	CAUSE=COVID-19-related disruptions, including patients’ inability to access health care providers, prioritization of COVID-19 patients, as well as social distancing measures
	EFFECT=our results.  



 ......  The decline during the first six months of 2021 primarily reflects decreases across markets due to ongoing generic competition for products within the established brands business, particularly for cardiovascular products Zetia and Vytorin (ezetimibe and simvastatin), lower sales of respiratory products Singulair (montelukast), Dulera and Nasonex, and generic competition for women’s health product 

2 >>> VOICE=active ROOT=reflects advmod=[primarily]
	CAUSE=decreases across markets due to ongoing generic competiti

In [None]:
# Reconsidering data structure - each sentence in a row, sentences from the same document share the same id

# verbs seed -> casuse-effect candidate pool -> find more connective verbs


In [86]:
def get_MDA(row):
    
    global data_root
    
    ticker = str(row['ticker'])
    type_= str(row['type'])
    file = str(row['file'])   
    
    folder = data_root + 'Samples/' + ticker + "/" + type_ + "/" + file + "/"
    file_name = folder + "/raw_mda.txt"
    
    try:
        with open(file_name) as f:
            text = f.read()
    except OSError:
        text = ''
    
    return text


def get_sentences(text):
    
    doc = nlp(text)
    sentences = [s for s in doc.sents]
    
    return sentences




def filter_sentences(sentence, causal_verbs, phrase_matcher):
    verb = '' #sentence.root.lemma_
    left = '' #sentence[:sentence.root.i].text
    right = '' #sentence[sentence.root.i+1:].text
    voice = ''
    subj = ''
    obj = ''
    cause= ''
    effect= ''
    causal_verb = ''
    causal_phrase = ''
    
    for child in sentence.root.children:
        if child.dep_ in ['nsubj', 'nsubjpass']:
            subj = child
            
        if child.dep_ in ['obj','dobj','iobj']:
            obj = child
        # object: passive - by (agent) + obj (prop)
    
    d = break_down(sentence.text) #keys: voice, root, left, right
    if d:
        verb = d['root']
        voice = d['voice']
        left = d['left']
        right = d['right']

        if verb.text in causal_verbs: 
            causal_verb =  True

            cause = get_cause(d, CvE,EvC)
            effect = get_effect(d, CvE,EvC)
        
        # TODO: optimize this code...
        matched_phrases = phrase_matcher(nlp(right.text)) 
        if len(matched_phrases) > 0:
            causal_phrase = right[matched_phrases[0][1]:matched_phrases[0][2]]
            cause = right[matched_phrases[0][2]:]
            effect = left

        matched_phrases = phrase_matcher(nlp(left.text))
        if len(matched_phrases) > 0:
            causal_phrase = left[matched_phrases[0][1]:matched_phrases[0][2]]
            cause = left[matched_phrases[0][2]:]
            effect = right #subj+verb+obj

    return voice, subj, verb, obj, causal_verb, causal_phrase, cause, effect 


    

def transfer_sentences(df_sent,df_doc):

    for index, row in df_doc.iterrows():

        ticker = str(row['ticker'])
        type_ = str(row['type'])
        file = str(row['file'])
        id_ = (ticker,type_,file) #unique id for each document

        text = get_MDA(row)  # raw text
        sentences = get_sentences(text)  # SpaCy doc
        
        df_ = pd.DataFrame()
        df_['sentence'] = sentences
        df_['temp'] = df_['sentence'].apply(lambda sentence : filter_sentences(sentence, causal_verbs, phrase_matcher))
        df_['voice'] = df_['temp'].apply(lambda x:x[0])
        df_['subj'] = df_['temp'].apply(lambda x:x[1])
        df_['verb'] = df_['temp'].apply(lambda x:x[2])
        df_['obj'] = df_['temp'].apply(lambda x:x[3])
        df_['causal_verb'] = df_['temp'].apply(lambda x:x[4])
        df_['causal_phrase'] = df_['temp'].apply(lambda x:x[5])
        df_['cause'] = df_['temp'].apply(lambda x:x[6])
        df_['effect'] = df_['temp'].apply(lambda x:x[7])
        
        df_ = df_[df_['voice']!=''] # removed non-sentences (i.e. there is no 'nsubj' or 'nsubjpass')
        df_['position'] = [i for i in range(len(df_))]
        df_['id'] = [id_]*len(df_)
        
        df_.drop(columns='temp', inplace=True)
        df_sent = df_sent.append(df_, ignore_index=True)

        print(id_, 'num of sentences:', len(sentences),'->',len(df_))

    return df_sent


causal_verbs = ['drive','reflect','affect','impact','cause']
CvE = ['drive','affect','impact','cause','result']
EvC = ['reflect']

from spacy.matcher import PhraseMatcher
causal_phrases = ['due to', 'attributable to', 'driven by', 'impacted by','offset by','as a result of'] # in clauses, not main verb
patterns = [nlp(text) for text in causal_phrases]
phrase_matcher = PhraseMatcher(nlp.vocab,attr="LOWER")
phrase_matcher.add('causal_phrases', None, *patterns)

#matched_phrases = phrase_matcher(doc)


In [87]:
data_root = "C:/Users/clair/Desktop/Thesis/masterThesis2022/Data/"
df_doc = pd.read_pickle(data_root+'/sample2_mda_wordcount.pkl')

#df_sent = pd.DataFrame(columns = ['id','position','sentence','verb','voice','left','right'])

df_sent = pd.DataFrame()
df_sent = transfer_sentences(df_sent,df_doc.loc[0:3])

print(len(df_sent))


('OGN', '10-Q', '0001821825-21-000009') num of sentences: 220 -> 139
('OGN', '10-Q', '0001821825-21-000005') num of sentences: 161 -> 99
('NI', '10-K', '0001174947-21-000255') num of sentences: 0 -> 0
('NI', '10-K', '0001111711-21-000010') num of sentences: 585 -> 226
464


In [88]:
df_sent

Unnamed: 0,sentence,voice,subj,verb,obj,causal_verb,causal_phrase,cause,effect,position,id
0,"(We, make, statements, in, this, report, ,, and, we, may, from, time, to, time, make, other, written, reports, and, oral, statements, ,, regarding, our, outlook, or, expectations, for, financial, ,, business, or, strategic, matters, regarding, or, affecting, us)",active,We,make,statements,,,,,0.0,"(OGN, 10-Q, 0001821825-21-000009)"
1,"(One, can, identify, these, forward, -, looking, statements, by, their, use, of, words, such, as, “, anticipates, ,, ”, “, expects, ,, ”, “, plans, ,, ”, “, will, ,, ”, “, estimates, ,, ”, “, forecasts, ,, ”, “, projects, ”, and, other, words, of, similar, meaning, ,, or, negative, variations, of, any, of, the, foregoing, .)",active,One,identify,statements,,,,,1.0,"(OGN, 10-Q, 0001821825-21-000009)"
2,"(One, can, also, identify, them, by, the, fact, that, they, do, not, relate, strictly, to, historical, or, current, facts, .)",active,One,identify,them,,,,,2.0,"(OGN, 10-Q, 0001821825-21-000009)"
3,"(Such, forward, -, looking, statements, include, ,, but, are, not, limited, to, ,, statements, relating, to, the, Company, ’s, growth, strategy, ,, financial, results, ,, product, development, ,, product, approvals, ,, product, potential, and, development, programs, .)",active,statements,include,statements,,,,,3.0,"(OGN, 10-Q, 0001821825-21-000009)"
4,"(One, must, carefully, consider, any, such, statement, and, should, understand, that, many, factors, could, cause, actual, results, to, differ, materially, from, the, Company, ’s, forward, -, looking, statements, .)",active,One,consider,statement,,,,,4.0,"(OGN, 10-Q, 0001821825-21-000009)"
5,"(These, factors, include, inaccurate, assumptions, and, a, broad, variety, of, other, risks, and, uncertainties, ,, including, the, impact, of, the, global, outbreak, of, COVID-19, and, other, risks, and, uncertainties, some, that, are, known, and, some, that, are, not, .)",active,factors,include,assumptions,,,,,5.0,"(OGN, 10-Q, 0001821825-21-000009)"
6,"(No, forward, -, looking, statement, can, be, guaranteed, and, actual, future, results, may, vary, materially, .)",passive,statement,guaranteed,,,,,,6.0,"(OGN, 10-Q, 0001821825-21-000009)"
7,"(The, factors, described, in, Part, II, .)",active,factors,described,,,,,,7.0,"(OGN, 10-Q, 0001821825-21-000009)"
8,"(Item, 1A., Risk, Factors, of, this, report, or, otherwise, described, in, our, filings, with, the, SEC, ,, provide, examples, of, risks, ,, uncertainties, and, events, that, may, cause, our, actual, results, to, differ, materially, from, the, expectations, expressed, in, our, forward, -, looking, statements, ,, including, ,, but, not, limited, to, :, •, difficulties, in, operating, as, an, independent, company, ;, •, costs, and, temporary, business, interruptions, related, to, the, Separation, ;, •, competition, from, generic, and, /or, biosimilar, products, as, our, products, lose, patent, protection, ;, •, expanded, competition, in, the, women, 's, health, market, ;, •, difficulties, with, ...)",active,Factors,provide,competition,,,,,8.0,"(OGN, 10-Q, 0001821825-21-000009)"
9,"(Consequently, ,, the, reader, should, not, consider, the, above, list, or, any, other, such, list, to, be, a, complete, statement, of, all, potential, risks, or, uncertainties, .)",active,reader,consider,list,,,,,9.0,"(OGN, 10-Q, 0001821825-21-000009)"


In [90]:
s1 = 'Total sales decreased 22% primarily attributable to the impact of Covid-19.' # "Singulair sales in the first six months of 2021 decreased 22% primarily attributable to the impact of VBP in China, lower volume in Japan due to generic competition as well as the timing of shipments, and ongoing impact of the COVID-19 pandemic in the Asia Pacific region."
s2 =  'The increase is primarily due to higher sales of health products.' #"The increase is primarily due to higher sales of women’s health products, including Nexplanon, Implanon and NXT."
s3 =  'the overall increase in working capital was driven by cash funding for new factory.' #"The overall increase in working capital of continuing operations was primarily driven by cash funding by Merck in connection with the Separation, offset by an increase in current liabilities with Merck primarily for inventory purchases, as well as increases in employee benefits and payroll."

In [128]:
for s in [s1,s2,s3]:
    doc = nlp(s)
    print('\n\n',doc,'\n')
    subj = 'NONE'
    obj = 'NONE'
    
    for token in doc:
        print(token,'...',token.dep_,'...',token.head)
        if token.dep_ == 'nsubj' or token.dep_ == 'nsubjpass':
            subj = token
        if token.dep_ in ['obj','dobj','iobj']:
            obj = token
        verb = doc[:].root
    print('\n>>>',subj,verb,obj,'\n')
    

    for child in verb.lefts:
        if child.head == verb:
            #print(child)
            for nounphrase in doc[:doc[:].root.i].noun_chunks:
                if nounphrase.root == child:
                    print(nounphrase)
                        
    print(verb)
    for child in verb.rights:
        if child.head == verb:
            print(child,[*child.children])
            for nounphrase in doc[doc[:].root.i+1:].noun_chunks:
                #print(nounphrase.root,nounphrase.root.head)
                if nounphrase.root.head in [*child.children]:
                    print(nounphrase)
    



 Total sales decreased 22% primarily attributable to the impact of Covid-19. 

Total ... amod ... sales
sales ... nsubj ... decreased
decreased ... ROOT ... decreased
22 ... nummod ... %
% ... npadvmod ... decreased
primarily ... advmod ... attributable
attributable ... advmod ... decreased
to ... prep ... attributable
the ... det ... impact
impact ... pobj ... to
of ... prep ... impact
Covid-19 ... pobj ... of
. ... punct ... decreased

>>> sales decreased NONE 

Total sales
decreased
% [22]
attributable [primarily, to]
the impact
. []


 The increase is primarily due to higher sales of health products. 

The ... det ... increase
increase ... nsubj ... is
is ... ROOT ... is
primarily ... advmod ... is
due ... prep ... is
to ... pcomp ... due
higher ... amod ... sales
sales ... pobj ... due
of ... prep ... sales
health ... compound ... products
products ... pobj ... of
. ... punct ... is

>>> increase is NONE 

The increase
is
primarily []
due [to, sales]
. []


 the overall increase