# NLP on Financial Statements Part 1

In [1]:
# import sys
# !{sys.executable} -m pip install cloudpickle alphalens nltk numpy ratelimit requests scikit-learn six tqdm

In [2]:
# !pip install tqdm
# !pip install ratelimit
# conda install -c cvxgrp cvxpy
# conda install -c conda-forge cycler
# conda install -c plotly plotly
# conda install -c anaconda pyparsing
# conda install -c anaconda pytz
# conda install scikit-learn #python 3.7.0 and rolls back conda install
# pip install zipline

### Load Packages

In [3]:
import nltk
import numpy as np
import pandas as pd
import pickle
import pprint
from tqdm import tqdm
import matplotlib.pyplot as plt
import requests
import ratelimit
from ratelimit import limits, sleep_and_retry
import re
import requests

### Download NLP Corpora
The stopwords corpus for removing stopwords and wordnet for lemmatizing.

In [4]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to C:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Starter Code

In [5]:
class SecAPI(object):
    SEC_CALL_LIMIT = {'calls': 10, 'seconds': 1}
    @staticmethod
    @sleep_and_retry
    # Dividing the call limit by half to avoid coming close to the limit
    @limits(calls=SEC_CALL_LIMIT['calls'] / 2, period=SEC_CALL_LIMIT['seconds'])
    def _call_sec(url):
        return requests.get(url)

    def get(self, url):
        return self._call_sec(url).text

In [6]:
def print_twenty_F_data(twenty_F_data, fields, field_length_limit=50):
    indentation = '  '

    print('[')
    for twenty_F in twenty_F_data:
        print_statement = '{}{{'.format(indentation)
        for field in fields:
            value = str(twenty_F[field])

            # Show return lines in output
            if isinstance(value, str):
                value_str = '\'{}\''.format(value.replace('\n', '\\n'))
            else:
                value_str = str(value)

            # Cut off the string if it gets too long
            if len(value_str) > field_length_limit:
                value_str = value_str[:field_length_limit] + '...'

            print_statement += '\n{}{}: {}'.format(indentation * 2, field, value_str)

        print_statement += '},'
        print(print_statement)
    print(']')

In [7]:
def plot_similarities(similarities_list, dates, title, labels):
    assert len(similarities_list) == len(labels)

    plt.figure(1, figsize=(10, 7))
    for similarities, label in zip(similarities_list, labels):
        plt.title(title)
        plt.plot(dates, similarities, label=label)
        plt.legend()
        plt.xticks(rotation=90)

    plt.show()

## Get 20-Fs
We'll be running NLP analysis on 20-F documents. To do that, we first need to download the documents. For this project, we'll download 20-Fs for a few companies. To lookup documents for these companies, we'll use their CIK.

In [8]:
tickerList_df = pd.read_csv('OTC.csv')
tickerList = tickerList_df['Ticker'].tolist()

In [9]:
def getCIKs(TICKERS):
    URL = 'http://www.sec.gov/cgi-bin/browse-edgar?CIK={}&Find=Search&owner=exclude&action=getcompany'
    CIK_RE = re.compile(r'.*CIK=(\d{10}).*')
    cik_dict = {}
    for ticker in TICKERS:
        f = requests.get(URL.format(ticker), stream = True)
        results = CIK_RE.findall(f.text)
        if len(results):
            results[0] = int(re.sub('\.[0]*', '.', results[0]))
            cik_dict[str(ticker).upper()] = str(results[0])
    f = open('cik_dict', 'w')   
    print(cik_dict)
    f.close()

In [10]:
with open('OTC_tickerList.txt', 'w') as f:
    print(getCIKs(tickerList), file=f)

{'ZXAIY': '1506756', 'LDKYQ': '1385424', 'LASLY': '1499673', 'LGFTY': '1412494', 'YZCAY': '1048098', 'ZAHLY': '1501176'}


### OTC stocks with CIK numbers
{'ZXAIY': '1506756', 'LDKYQ': '1385424', 'LASLY': '1499673', 'LGFTY': '1412494', 'YZCAY': '1048098', 'ZAHLY': '1501176'}

### Using the same method to obtain the CIK for NYSE and NASDAQ stocks:

{'YI': '1738906', 'VNET': '1508475', 'QFIN': '1741530', 'WBAI': '1517496', 'JOBS': '1295484', 'WUBA': '1525494', 'ATV': '1365742', 'AMCN': '1413745', 'BABA': '1577552', 'ACH': '1161611', 'ATAI': '1420529', 'JG': '1737339', 'ATHM': '1527636', 'BIDU': '1329099', 'BZUN': '1625414', 'BILI': '1723690', 'BEDU': '1696355', 'CANG': '1725123', 'CYOU': '1458696', 'CMCM': '1597835', 'DL': '1438644', 'CEA': '1030475', 'JRJC': '1297830', 'LFC': '1268896', 'HTHT': '1483994', 'CHL': '1117795', 'CEO': '1095595', 'BORN': '1490366', 'COE': '1659494', 'SNP': '1123658', 'XRF': '1346610', 'ZNH': '1041668', 'CNTF': '1316317', 'CHA': '1191255', 'CHU': '1113866', 'CCIH': '1498576', 'CCM': '1472072', 'CTRP': '1269238', 'DQ': '1477641', 'EHIC': '1517492', 'SFUN': '1294404', 'FANH': '1413855', 'GDS': '1526125', 'GHG': '1724755', 'GSUM': '1647338', 'GSH': '1012139', 'HLG': '1596964', 'HQCL': '1371541', 'HX': '1702318', 'HMI': '1720446', 'HNP': '929058', 'HCM': '1648257', 'HUYA': '1728190', 'KANG': '1524190', 'IQ': '1722608', 'JD': '1549802', 'JT': '1713923', 'JKS': '1481513', 'JMU': '1527762', 'JMEI': '1597680', 'JP': '1616291', 'KZ': '1285137', 'LEJU': '1596856', 'LX': '1708259', 'LITB': '1523836', 'MOMO': '1610601', 'NTES': '1110646', 'EDU': '1372920', 'NIO': '1736541', 'NOAH': '1499543', 'ONE': '1722380', 'OSN': '1485538', 'PTR': '1108329', 'FENG': '1509646', 'PDD': '1737806', 'PPDF': '1691445', 'QD': '1692705', 'SOL': '1417892', 'RENN': '1509223', 'REDU': '1712178', 'RYB': '1708441', 'SECO': '1633441', 'SHI': '908732', 'SKYS': '1594124', 'SOGO': '1713947', 'SOHU': '1734107', 'TAL': '1499620', 'TEDU': '1592560', 'TME': '1744676', 'NCTY': '1296774', 'TC': '1743340', 'TOUR': '1597095', 'VIPS': '1529192', 'WB': '1595761', 'XYF': '1725033', 'XIN': '1398453', 'XNET': '1510593', 'YIN': '1661125', 'YRD': '1631761', 'YY': '1530238', 'ZLAB': '1704292', 'ZPIN': '1378564', 'ZTO': '1677250'}

In [11]:
cik_lookup = {'YI': '1738906', 'VNET': '1508475', 'QFIN': '1741530', 'WBAI': '1517496', 'JOBS': '1295484', 'WUBA': '1525494', 'ATV': '1365742', 'AMCN': '1413745', 'BABA': '1577552', 'ACH': '1161611', 'ATAI': '1420529', 'JG': '1737339', 'ATHM': '1527636', 'BIDU': '1329099', 'BZUN': '1625414', 'BILI': '1723690', 'BEDU': '1696355', 'CANG': '1725123', 'CYOU': '1458696', 'CMCM': '1597835', 'DL': '1438644', 'CEA': '1030475', 'JRJC': '1297830', 'LFC': '1268896', 'HTHT': '1483994', 'CHL': '1117795', 'CEO': '1095595', 'BORN': '1490366', 'COE': '1659494', 'SNP': '1123658', 'XRF': '1346610', 'ZNH': '1041668', 'CNTF': '1316317', 'CHA': '1191255', 'CHU': '1113866', 'CCIH': '1498576', 'CCM': '1472072', 'CTRP': '1269238', 'DQ': '1477641', 'EHIC': '1517492', 'SFUN': '1294404', 'FANH': '1413855', 'GDS': '1526125', 'GHG': '1724755', 'GSUM': '1647338', 'GSH': '1012139', 'HLG': '1596964', 'HQCL': '1371541', 'HX': '1702318', 'HMI': '1720446', 'HNP': '929058', 'HCM': '1648257', 'HUYA': '1728190', 'KANG': '1524190', 'IQ': '1722608', 'JD': '1549802', 'JT': '1713923', 'JKS': '1481513', 'JMU': '1527762', 'JMEI': '1597680', 'JP': '1616291', 'KZ': '1285137', 'LEJU': '1596856', 'LX': '1708259', 'LITB': '1523836', 'MOMO': '1610601', 'NTES': '1110646', 'EDU': '1372920', 'NIO': '1736541', 'NOAH': '1499543', 'ONE': '1722380', 'OSN': '1485538', 'PTR': '1108329', 'FENG': '1509646', 'PDD': '1737806', 'PPDF': '1691445', 'QD': '1692705', 'SOL': '1417892', 'RENN': '1509223', 'REDU': '1712178', 'RYB': '1708441', 'SECO': '1633441', 'SHI': '908732', 'SKYS': '1594124', 'SOGO': '1713947', 'SOHU': '1734107', 'TAL': '1499620', 'TEDU': '1592560', 'TME': '1744676', 'NCTY': '1296774', 'TC': '1743340', 'TOUR': '1597095', 'VIPS': '1529192', 'WB': '1595761', 'XYF': '1725033', 'XIN': '1398453', 'XNET': '1510593', 'YIN': '1661125', 'YRD': '1631761', 'YY': '1530238', 'ZLAB': '1704292', 'ZPIN': '1378564', 'ZTO': '1677250', 'ZXAIY': '1506756', 'LDKYQ': '1385424', 'LASLY': '1499673', 'LGFTY': '1412494', 'YZCAY': '1048098', 'ZAHLY': '1501176'}

In [12]:
with open('combined_tickerList_dictionary.txt', 'w') as f:
    print(cik_lookup, file=f)

### Get list of 20-Fs
The SEC has a limit on the number of calls you can make to the website per second. In order to avoid hiding that limit, we've created the `SecAPI` class. This will cache data from the SEC and prevent you from going over the limit.

In [13]:
sec_api = SecAPI()

With the class constructed, let's pull a list of filled 20-Fs from the SEC for each company.

It is important to know the particular company's fiscal year in order to distinguish financial quarterly related 6-Ks from other informational disclosure 6-Ks. At the end of the fiscal year and last quarter, Chinese ADRs file a 20-F report, which provides the annual statements for the year.

In [14]:
from bs4 import BeautifulSoup

def get_sec_data(cik, doc_type, start=0, count=60):
    newest_pricing_data = pd.to_datetime('2019-05-13')
    rss_url = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany' \
        '&CIK={}&type={}&start={}&count={}&owner=exclude&output=atom' \
        .format(cik, doc_type, start, count)
    sec_data = sec_api.get(rss_url)
    feed = BeautifulSoup(sec_data.encode('ascii'), 'xml').feed
    entries = [
        (
            entry.content.find('filing-href').getText(),
            entry.content.find('filing-type').getText(),
            entry.content.find('filing-date').getText())
        for entry in feed.find_all('entry', recursive=False)
        if pd.to_datetime(entry.content.find('filing-date').getText()) <= newest_pricing_data]

    return entries

Let's pull the list using the `get_sec_data` function, then display some of the results. For displaying some of the data, we'll use ZXAIY as an example. 

In [15]:
example_ticker = 'ZXAIY'
sec_data = {}

for ticker, cik in cik_lookup.items():
    sec_data[ticker] = get_sec_data(cik, '20-F')

pprint.pprint(sec_data[example_ticker][:5])

[('https://www.sec.gov/Archives/edgar/data/1506756/000119312519120866/0001193125-19-120866-index.htm',
  '20-F',
  '2019-04-26'),
 ('https://www.sec.gov/Archives/edgar/data/1506756/000119312518136578/0001193125-18-136578-index.htm',
  '20-F',
  '2018-04-27'),
 ('https://www.sec.gov/Archives/edgar/data/1506756/000119312517145094/0001193125-17-145094-index.htm',
  '20-F',
  '2017-04-28'),
 ('https://www.sec.gov/Archives/edgar/data/1506756/000119312516560893/0001193125-16-560893-index.htm',
  '20-F',
  '2016-04-28'),
 ('https://www.sec.gov/Archives/edgar/data/1506756/000119312515161773/0001193125-15-161773-index.htm',
  '20-F',
  '2015-04-30')]


### Download 20-Fs
As you see, this is a list of urls. These urls point to a file that contains metadata related to each filling. Since we don't care about the metadata, we'll pull the filling by replacing the url with the filling url.

In [16]:
raw_fillings_by_ticker = {}

for ticker, data in sec_data.items():
    raw_fillings_by_ticker[ticker] = {}
    for index_url, file_type, file_date in tqdm(data, desc='Downloading {} Fillings'.format(ticker), unit='filling'):
        if (file_type == '20-F'):
            file_url = index_url.replace('-index.htm', '.txt').replace('.txtl', '.txt')            
            raw_fillings_by_ticker[ticker][file_date] = sec_api.get(file_url)

print('Example Document:\n\n{}...'.format(next(iter(raw_fillings_by_ticker[example_ticker].values()))[:1000]))

Downloading YI Fillings: 100%|██████████| 1/1 [00:00<00:00,  2.07filling/s]
Downloading VNET Fillings: 100%|██████████| 9/9 [00:04<00:00,  2.05filling/s]
Downloading QFIN Fillings: 100%|██████████| 1/1 [00:00<00:00,  2.02filling/s]
Downloading WBAI Fillings: 100%|██████████| 7/7 [00:03<00:00,  1.98filling/s]
Downloading JOBS Fillings: 100%|██████████| 16/16 [00:07<00:00,  2.32filling/s]
Downloading WUBA Fillings: 100%|██████████| 9/9 [00:03<00:00,  2.24filling/s]
Downloading ATV Fillings: 100%|██████████| 16/16 [00:05<00:00,  2.50filling/s]
Downloading AMCN Fillings: 100%|██████████| 14/14 [00:05<00:00,  2.43filling/s]
Downloading BABA Fillings: 100%|██████████| 4/4 [00:02<00:00,  1.58filling/s]
Downloading ACH Fillings: 100%|██████████| 28/28 [00:12<00:00,  2.34filling/s]
Downloading ATAI Fillings: 100%|██████████| 13/13 [00:06<00:00,  2.17filling/s]
Downloading JG Fillings: 100%|██████████| 1/1 [00:00<00:00,  1.84filling/s]
Downloading ATHM Fillings: 100%|██████████| 6/6 [00:03<00:00

Example Document:

<SEC-DOCUMENT>0001193125-19-120866.txt : 20190426
<SEC-HEADER>0001193125-19-120866.hdr.sgml : 20190426
<ACCEPTANCE-DATETIME>20190426072508
ACCESSION NUMBER:		0001193125-19-120866
CONFORMED SUBMISSION TYPE:	20-F
PUBLIC DOCUMENT COUNT:		157
CONFORMED PERIOD OF REPORT:	20181231
FILED AS OF DATE:		20190426
DATE AS OF CHANGE:		20190426

FILER:

	COMPANY DATA:	
		COMPANY CONFORMED NAME:			China Zenix Auto International Ltd
		CENTRAL INDEX KEY:			0001506756
		STANDARD INDUSTRIAL CLASSIFICATION:	MOTOR VEHICLE PARTS & ACCESSORIES [3714]
		IRS NUMBER:				000000000

	FILING VALUES:
		FORM TYPE:		20-F
		SEC ACT:		1934 Act
		SEC FILE NUMBER:	001-35154
		FILM NUMBER:		19769356

	BUSINESS ADDRESS:	
		STREET 1:		1608, North Circle Road State Highway
		STREET 2:		Zhangzhou
		CITY:			Fujian Province
		STATE:			F4
		ZIP:			363000
		BUSINESS PHONE:		(86) 596-2600308

	MAIL ADDRESS:	
		STREET 1:		1608, North Circle Road State Highway
		STREET 2:		Zhangzhou
		CITY:			Fujian Province
		STA

### Get Documents
With theses fillings downloaded, we want to break them into their associated documents. These documents are sectioned off in the fillings with the tags `<DOCUMENT>` for the start of each document and `</DOCUMENT>` for the end of each document. There's no overlap with these documents, so each `</DOCUMENT>` tag should come after the `<DOCUMENT>` with no `<DOCUMENT>` tag in between.

Implement `get_documents` to return a list of these documents from a filling. Make sure not to include the tag in the returned document text.

In [17]:
import re

def get_documents(text):
    doc_start_pattern = re.compile(r'<DOCUMENT>')
    doc_end_pattern = re.compile(r'</DOCUMENT>')
    doc_start_is = [x.end() for x in doc_start_pattern.finditer(text)]
    doc_end_is = [x.start() for x in doc_end_pattern.finditer(text)]
    extracted_docs = []
    for doc_start_i, doc_end_i in zip(doc_start_is, doc_end_is):
        doc = text[doc_start_i:doc_end_i]
        extracted_docs.append(doc)
    return extracted_docs

With the `get_documents` function implemented, let's extract all the documents.

In [18]:
filling_documents_by_ticker = {}

for ticker, raw_fillings in raw_fillings_by_ticker.items():
    filling_documents_by_ticker[ticker] = {}
    for file_date, filling in tqdm(raw_fillings.items(), desc='Getting Documents from {} Fillings'.format(ticker), unit='filling'):
        filling_documents_by_ticker[ticker][file_date] = get_documents(filling)


print('\n\n'.join([
    'Document {} Filed on {}:\n{}...'.format(doc_i, file_date, doc[:200])
    for file_date, docs in filling_documents_by_ticker[example_ticker].items()
    for doc_i, doc in enumerate(docs)][:3]))

Getting Documents from YI Fillings: 100%|██████████| 1/1 [00:00<00:00, 32.35filling/s]
Getting Documents from VNET Fillings: 100%|██████████| 8/8 [00:00<00:00,  9.33filling/s]
Getting Documents from QFIN Fillings: 100%|██████████| 1/1 [00:00<00:00, 27.82filling/s]
Getting Documents from WBAI Fillings: 100%|██████████| 6/6 [00:00<00:00, 11.73filling/s]
Getting Documents from JOBS Fillings: 100%|██████████| 15/15 [00:00<00:00, 18.43filling/s]
Getting Documents from WUBA Fillings: 100%|██████████| 6/6 [00:00<00:00, 29.20filling/s]
Getting Documents from ATV Fillings: 100%|██████████| 12/12 [00:00<00:00, 48.12filling/s]
Getting Documents from AMCN Fillings: 100%|██████████| 12/12 [00:00<00:00, 63.66filling/s]
Getting Documents from BABA Fillings: 100%|██████████| 4/4 [00:00<00:00, 22.79filling/s]
Getting Documents from ACH Fillings: 100%|██████████| 20/20 [00:00<00:00, 25.98filling/s]
Getting Documents from ATAI Fillings: 100%|██████████| 12/12 [00:00<00:00, 20.53filling/s]
Getting Documen

Document 0 Filed on 2019-04-26:

<TYPE>20-F
<SEQUENCE>1
<FILENAME>d684592d20f.htm
<DESCRIPTION>FORM 20-F
<TEXT>
<HTML><HEAD>
<TITLE>Form 20-F</TITLE>
</HEAD>
 <BODY BGCOLOR="WHITE">
<h5 align="left"><a href="#toc">Table of Contents<...

Document 1 Filed on 2019-04-26:

<TYPE>EX-8.1
<SEQUENCE>2
<FILENAME>d684592dex81.htm
<DESCRIPTION>EX-8.1
<TEXT>
<HTML><HEAD>
<TITLE>EX-8.1</TITLE>
</HEAD>
 <BODY BGCOLOR="WHITE">


<Center><DIV STYLE="width:8.5in" align="left">
 <P ...

Document 2 Filed on 2019-04-26:

<TYPE>EX-12.1
<SEQUENCE>3
<FILENAME>d684592dex121.htm
<DESCRIPTION>EX-12.1
<TEXT>
<HTML><HEAD>
<TITLE>EX-12.1</TITLE>
</HEAD>
 <BODY BGCOLOR="WHITE">


<Center><DIV STYLE="width:8.5in" align="left">
...


### Get Document Types
Now that we have all the documents, we want to find the 20-f form in this 20-f filing. Implement the `get_document_type` function to return the type of document given. The document type is located on a line with the `<TYPE>` tag. For example, a form of type "TEST" would have the line `<TYPE>TEST`. Make sure to return the type as lowercase, so this example would be returned as "test".

In [19]:
def get_document_type(doc):
    type_pattern = re.compile(r'<TYPE>[^\n]+')
    doc_type = [x[len('<TYPE>'):] for x in type_pattern.findall(doc)]
    return doc_type[0].lower()

With the `get_document_type` function, we'll filter out all non 20-F documents.

In [20]:
twenty_Fs_by_ticker = {}

for ticker, filling_documents in filling_documents_by_ticker.items():
    twenty_Fs_by_ticker[ticker] = []
    for file_date, documents in filling_documents.items():
        for document in documents:
            if get_document_type(document) == '20-f':
                twenty_Fs_by_ticker[ticker].append({
                    'cik': cik_lookup[ticker],
                    'file': document,
                    'file_date': file_date})

In [21]:
print_twenty_F_data(twenty_Fs_by_ticker[example_ticker][:5], ['cik', 'file', 'file_date'])

[
  {
    cik: '1506756'
    file: '\n<TYPE>20-F\n<SEQUENCE>1\n<FILENAME>d684592d20f....
    file_date: '2019-04-26'},
  {
    cik: '1506756'
    file: '\n<TYPE>20-F\n<SEQUENCE>1\n<FILENAME>d511822d20f....
    file_date: '2018-04-27'},
  {
    cik: '1506756'
    file: '\n<TYPE>20-F\n<SEQUENCE>1\n<FILENAME>d254317d20f....
    file_date: '2017-04-28'},
  {
    cik: '1506756'
    file: '\n<TYPE>20-F\n<SEQUENCE>1\n<FILENAME>d156239d20f....
    file_date: '2016-04-28'},
  {
    cik: '1506756'
    file: '\n<TYPE>20-F\n<SEQUENCE>1\n<FILENAME>d912903d20f....
    file_date: '2015-04-30'},
]


## Preprocess the Data
### Clean Up
The text for the documents are very messy. To clean this up, we'll remove the html and lowercase all the text.

In [22]:
def remove_html_tags(text):
    text = BeautifulSoup(text, 'html.parser').get_text()    
    return text

In [23]:
def clean_text(text):
    text = text.lower()
    text = remove_html_tags(text)    
    return text

Using the `clean_text` function, we'll clean up all the documents.

In [24]:
# parallelize 

for ticker, twenty_Fs in twenty_Fs_by_ticker.items():
    for twenty_F in tqdm(twenty_Fs, desc='Cleaning {} 20-Fs'.format(ticker), unit='20-F'):
        twenty_F['file_clean'] = clean_text(twenty_F['file'])

print_twenty_F_data(twenty_Fs_by_ticker[example_ticker][:5], ['file_clean'])

Cleaning YI 20-Fs: 100%|██████████| 1/1 [00:01<00:00,  1.92s/20-F]
Cleaning VNET 20-Fs: 100%|██████████| 8/8 [00:20<00:00,  2.54s/20-F]
Cleaning QFIN 20-Fs: 100%|██████████| 1/1 [00:01<00:00,  1.45s/20-F]
Cleaning WBAI 20-Fs: 100%|██████████| 6/6 [00:07<00:00,  1.19s/20-F]
Cleaning JOBS 20-Fs: 100%|██████████| 15/15 [00:20<00:00,  1.36s/20-F]
Cleaning WUBA 20-Fs: 100%|██████████| 6/6 [00:04<00:00,  1.2620-F/s]
Cleaning ATV 20-Fs: 100%|██████████| 12/12 [00:17<00:00,  1.86s/20-F]
Cleaning AMCN 20-Fs: 100%|██████████| 12/12 [00:12<00:00,  1.24s/20-F]
Cleaning BABA 20-Fs: 100%|██████████| 4/4 [00:18<00:00,  4.49s/20-F]
Cleaning ACH 20-Fs: 100%|██████████| 20/20 [01:04<00:00,  3.21s/20-F]
Cleaning ATAI 20-Fs: 100%|██████████| 12/12 [00:18<00:00,  1.63s/20-F]
Cleaning JG 20-Fs: 100%|██████████| 1/1 [00:01<00:00,  1.25s/20-F]
Cleaning ATHM 20-Fs: 100%|██████████| 6/6 [00:08<00:00,  1.46s/20-F]
Cleaning BIDU 20-Fs: 100%|██████████| 14/14 [00:23<00:00,  1.37s/20-F]
Cleaning BZUN 20-Fs: 100%|██

[
  {
    file_clean: '\n20-f\n1\nd684592d20f.htm\nform 20-f\n\n\nform 2...},
  {
    file_clean: '\n20-f\n1\nd511822d20f.htm\nform 20-f\n\n\nform 2...},
  {
    file_clean: '\n20-f\n1\nd254317d20f.htm\nform 20-f\n\n\nform 2...},
  {
    file_clean: '\n20-f\n1\nd156239d20f.htm\nform 20-f\n\n\nform 2...},
  {
    file_clean: '\n20-f\n1\nd912903d20f.htm\nform 20-f\n\n\nform 2...},
]


### Lemmatize
With the text cleaned up, it's time to distill the verbs down. Implement the `lemmatize_words` function to lemmatize verbs in the list of words provided.

In [25]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet


def lemmatize_words(words):
    lemmatized_words = [WordNetLemmatizer().lemmatize(w, pos='v') for w in words]    
    return lemmatized_words

With the `lemmatize_words` function implemented, let's lemmatize all the data.

In [26]:
len(twenty_Fs_by_ticker)

109

In [27]:
l = [[k,v] for k,v in twenty_Fs_by_ticker.items()]

In [28]:
from numba import njit, prange
import numpy as np

In [29]:


# def lemmatize_words(words):
#     lemmatized_words =    
#     return lemmatized_words


@njit(parallel=True)
def prange_test():
    s = 0
    # Without "parallel=True" in the jit-decorator
    # the prange statement is equivalent to range
    
    word_pattern = re.compile('\w+')


    for i in prange(len(l)):
        ticker,twenty_Fs = l[i]
#         k = [[k,v] for k,v in twenty_Fs.items()]
        for j in prange(len(twenty_Fs)):
#         for twenty_F in tqdm(twenty_Fs, desc='Lemmatize {} 20-Fs'.format(ticker), unit='20-F'):
            twenty_Fs[j]['file_lemma'] = [WordNetLemmatizer().lemmatize(w, pos='v') for w in word_pattern.findall(twenty_Fs[j]['file_clean'])]

    return

In [30]:
from spacy.vocab import Vocab
from spacy.language import Language
nlp = Language(Vocab())

from spacy.lang.en import English
nlp = English()

In [31]:
from spacy.tokenizer import Tokenizer
tokenizer = Tokenizer(nlp.vocab)


In [32]:
tokenizer.

SyntaxError: invalid syntax (<ipython-input-32-c9a3152c632d>, line 1)

In [None]:
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES
lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
lemmas = lemmatizer(u"ducks", u"NOUN")
assert lemmas == [u"duck"]

In [None]:
prange_test()

In [None]:
# para it.....

word_pattern = re.compile('\w+')

for ticker, twenty_Fs in twenty_Fs_by_ticker.items():
    for twenty_F in tqdm(twenty_Fs, desc='Lemmatize {} 20-Fs'.format(ticker), unit='20-F'):
        twenty_F['file_lemma'] = lemmatize_words(word_pattern.findall(twenty_F['file_clean']))

print_twenty_F_data(twenty_Fs_by_ticker[example_ticker][:5], ['file_lemma'])

### Remove Stopwords

In [None]:
from nltk.corpus import stopwords


lemma_english_stopwords = lemmatize_words(stopwords.words('english'))

for ticker, twenty_Fs in twenty_Fs_by_ticker.items():
    for twenty_F in tqdm(twenty_Fs, desc='Remove Stop Words for {} 20-Fs'.format(ticker), unit='20-F'):
        twenty_F['file_lemma'] = [word for word in twenty_F['file_lemma'] if word not in lemma_english_stopwords]


print('Stop Words Removed')

## Analysis on 20fs
### Loughran McDonald Sentiment Word Lists
We'll be using the Loughran and McDonald sentiment word lists. These word lists cover the following sentiment:
- Negative 
- Positive
- Uncertainty
- Litigious
- Constraining
- Superfluous
- Modal

This will allow us to do the sentiment analysis on the 20-Fs. Let's first load these word lists. We'll be looking into a few of these sentiments.

In [None]:
sentiment_df = pd.read_csv('LoughranMcDonald_MasterDictionary_2018.csv')

In [None]:
sentiments = ['negative', 'positive', 'uncertainty', 'litigious', 'constraining', 'interesting']
sentiment_df.columns = [column.lower() for column in sentiment_df.columns] # Lowercase the columns for ease of use

# Remove unused information
sentiment_df = sentiment_df[sentiments + ['word']]
sentiment_df[sentiments] = sentiment_df[sentiments].astype(bool)
sentiment_df = sentiment_df[(sentiment_df[sentiments]).any(1)]

# Apply the same preprocessing to these words as the 20-F words
sentiment_df['word'] = lemmatize_words(sentiment_df['word'].str.lower())
sentiment_df = sentiment_df.drop_duplicates('word')

sentiment_df.head()

In [None]:
sentiment_df.to_csv('sentimentDataFrame.csv')

### Bag of Words
using the sentiment word lists, let's generate sentiment bag of words from the 20-F documents. Implement `get_bag_of_words` to generate a bag of words that counts the number of sentiment words in each doc. You can ignore words that are not in `sentiment_words`.

In [None]:
from collections import defaultdict, Counter
from sklearn.feature_extraction.text import CountVectorizer


def get_bag_of_words(sentiment_words, docs):
    cnt = Counter()
    
    cv = CountVectorizer(vocabulary=sentiment_words)
    bag_of_words =  cv.fit_transform(docs).toarray()
    
    return bag_of_words

Using the `get_bag_of_words` function, we'll generate a bag of words for all the documents.

In [None]:
sentiment_bow_twenty_Fs = {}

for ticker, twenty_Fs in twenty_Fs_by_ticker.items():
    lemma_docs = [' '.join(twenty_F['file_lemma']) for twenty_F in twenty_Fs]
    
    sentiment_bow_twenty_Fs[ticker] = {
        sentiment: get_bag_of_words(sentiment_df[sentiment_df[sentiment]]['word'], lemma_docs)
        for sentiment in sentiments}

print_twenty_F_data([sentiment_bow_twenty_Fs[example_ticker]], sentiments)

### Jaccard Similarity
Using the bag of words, let's calculate the jaccard similarity on the bag of words and plot it over time. Implement `get_jaccard_similarity` to return the jaccard similarities between each tick in time. Since the input, `bag_of_words_matrix`, is a bag of words for each time period in order, you just need to compute the jaccard similarities for each neighboring bag of words. Make sure to turn the bag of words into a boolean array when calculating the jaccard similarity.

In [None]:
from sklearn.metrics import jaccard_similarity_score


def get_jaccard_similarity(bag_of_words_matrix):
    bool_array = np.array(bag_of_words_matrix, dtype=bool)
    jaccard_similarities = [jaccard_similarity_score(bool_array[i],bool_array[i+1]) \
                           for i in range(len(bool_array)-1)]
    return jaccard_similarities

Using the `get_jaccard_similarity` function, let's plot the similarities over time.

In [None]:
# Get dates for the universe
file_dates = {
    ticker: [twenty_F['file_date'] for twenty_F in twenty_Fs]
    for ticker, twenty_Fs in twenty_Fs_by_ticker.items()}  

jaccard_similarities = {
    ticker: {
        sentiment_name: get_jaccard_similarity(sentiment_values)
        for sentiment_name, sentiment_values in twenty_F_sentiments.items()}
    for ticker, twenty_F_sentiments in sentiment_bow_twenty_Fs.items()}


plot_similarities(
    [jaccard_similarities[example_ticker][sentiment] for sentiment in sentiments],
    file_dates[example_ticker][1:],
    'Jaccard Similarities for {} Sentiment'.format(example_ticker),
    sentiments)

### TFIDF
using the sentiment word lists, hereby generate sentiment TFIDF from the 20-f documents, i.e. the more the word appears, the more weighting is given to the sentiment associated with the word. Here `get_tfidf` generates TFIDF from each document, using sentiment words as the terms, words that are not in `sentiment_words` were ignored.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer


def get_tfidf(sentiment_words, docs):
    tf_idfVector = TfidfVectorizer(vocabulary = sentiment_words)
    tfidf = tf_idfVector.fit_transform(docs).toarray()
    return tfidf

Using the `get_tfidf` function, let's generate the TFIDF values for all the documents.

In [None]:
sentiment_df[sentiment]['word']

In [None]:
sentiment_tfidf_twenty_Fs = {}

for ticker, twenty_Fs in twenty_Fs_by_ticker.items():
    lemma_docs = [' '.join(twenty_F['file_lemma']) for twenty_F in twenty_Fs]
    
    sentiment_tfidf_twenty_Fs[ticker] = { sentiment: get_tfidf(sentiment_df[sentiment_df[sentiment]]['word'], lemma_docs) for sentiment in sentiments}

    
print_twenty_F_data([sentiment_tfidf_twenty_Fs[example_ticker]], sentiments)

### Cosine Similarity
Using the TFIDF values, the cosine similarity over time was calculated and plotted. Here `get_cosine_similarity` was used to return the cosine similarities between each tick in time. The input `tfidf_matrix` is a TFIDF vector for each time period in order, thus only the cosine similarities for each neighboring vector need be computed.

In [None]:
from itertools import combinations



In [None]:
sentiment_tfidf_twenty_Fs['YI']['negative']

In [None]:
from sklearn.metrics.pairwise import cosine_similarity,cosine_distances


# cosine_similarities(sentiments)

def get_cosine_similarity(sentiment_tfidf_twenty_Fs):

    sentiments_cosine_similarity ={}
    
    for i,j in combinations(sentiment_tfidf_twenty_Fs.keys(),2):
        u = sentiment_tfidf_twenty_Fs[i]
        v = sentiment_tfidf_twenty_Fs[j]
        for sentiment in sentiments:
#             print(i,j,sentiment,cosine_similarity(u[sentiment],v[sentiment]))
            u = tfidf_matrix[i].reshape(1,-1)
            v = tfidf_matrix[i+1].reshape(1,-1)
            cosine_similarities.append(cosine_similarity(u,v)[0][0])

            break
# def get_cosine_similarity(tfidf_matrix):
# # Get cosine similarities for each neighboring TFIDF vector/document
# # tfidf : 2-d Numpy Ndarray of floatTFIDF sentiment for each document, the first dimension is the document, the second dimension is the word
#     cosine_similarities = []
#     for i in range(len(tfidf_matrix)-1):
#         u = tfidf_matrix[i].reshape(1,-1)
#         v = tfidf_matrix[i+1].reshape(1,-1)
#         cosine_similarities.append(cosine_similarity(u,v)[0][0])
#     return cosine_similarities

# Returns cosine_similarities: list of float
# Cosine similarities for neighboring documents

In [None]:
v[sentiment]

In [None]:
sentiment_tfidf_twenty_Fs['YI']['negative']

In [None]:
sentiment_tfidf_twenty_Fs['YI']['positive']

In [None]:
 get_cosine_similarity(sentiment_tfidf_twenty_Fs)

Let's plot the cosine similarities over time.

In [None]:
cosine_similarities = {
    ticker: {
        sentiment_name: get_cosine_similarity(sentiment_values)
        for sentiment_name, sentiment_values in twenty_F_sentiments.items()}
    for ticker, twenty_F_sentiments in sentiment_tfidf_twenty_Fs.items()}


plot_similarities(
    [cosine_similarities[example_ticker][sentiment] for sentiment in sentiments],
    file_dates[example_ticker][1:],
    'Cosine Similarities for {} Sentiment'.format(example_ticker),
    sentiments)

In [None]:
cosine_similarities

## Save outputs as text files

In [None]:
import json

In [None]:
jaccard_similarities_df = pd.DataFrame.from_dict(jaccard_similarities)
jaccard_similarities_df.to_json('jaccard_similarities.json')

sentiment_tfidf_twenty_Fs_df = pd.DataFrame.from_dict(jaccard_similarities)
sentiment_tfidf_twenty_Fs_df.to_jason('sentiment_tfidf_twenty_Fs.json')

cosine_similarities_df = pd.DataFrame.from_dict(cosine_similarities)
cosine_similarities_df.to_jason('cosine_similarities.json')

In [None]:
with open('cosine_similarities.json', 'w') as file:
    file.write(json.dumps(cosine_similarities))

In [None]:
with open('filling_documents_by_ticker.json', 'w') as file:
    file.write(json.dumps(filling_documents_by_ticker))

In [None]:
with open('file_dates.json', 'w') as file:
    file.write(json.dumps(file_dates))

In [None]:
with open('twenty_Fs_by_ticker.json', 'w') as file:
    file.write(json.dumps(twenty_Fs_by_ticker))    