In [1]:
import pandas as pd
import numpy as np
import os
import re
import requests
import requests_random_user_agent
from bs4 import BeautifulSoup
import preprocess_data

[nltk_data] Downloading package punkt to
[nltk_data]     /home/codevardhan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/codevardhan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codevardhan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Scraping

In [2]:
amazon_cik = '0001018724'

In [3]:
rss_url = f'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={amazon_cik}&type=10-k&start=0&count=10000&owner=exclude&output=atom'
sec_data = requests.get(rss_url).text
feed = BeautifulSoup(sec_data.encode('ascii'), features="xml").feed

data = []

for entry in feed.find_all('entry', recursive=False):    
    data.append(
        (
            entry.content.find('filing-href').getText(),
            entry.content.find('filing-type').getText(),
            entry.content.find('filing-date').getText()
        )
    )



In [4]:
filings = dict()

for url, ftype, fdate in data:
    if ftype == '10-K':
        rurl = url.replace('-index.htm', '.txt')
        filings[fdate] = requests.get(rurl).text


In [5]:
df = pd.DataFrame([filings])
df.index = ['filing_data']
df.head()

Unnamed: 0,2022-02-04,2021-02-03,2020-01-31,2019-02-01,2018-02-02,2017-02-10,2016-01-29,2015-01-30,2014-01-31,2013-01-30,...,2010-01-29,2009-01-30,2008-02-11,2007-02-16,2006-02-17,2005-03-11,2004-02-25,2003-02-19,2000-03-29,1999-03-05
filing_data,<SEC-DOCUMENT>0001018724-22-000005.txt : 20220...,<SEC-DOCUMENT>0001018724-21-000004.txt : 20210...,<SEC-DOCUMENT>0001018724-20-000004.txt : 20200...,<SEC-DOCUMENT>0001018724-19-000004.txt : 20190...,<SEC-DOCUMENT>0001018724-18-000005.txt : 20180...,<SEC-DOCUMENT>0001018724-17-000011.txt : 20170...,<SEC-DOCUMENT>0001018724-16-000172.txt : 20160...,<SEC-DOCUMENT>0001018724-15-000006.txt : 20150...,<SEC-DOCUMENT>0001018724-14-000006.txt : 20140...,<SEC-DOCUMENT>0001193125-13-028520.txt : 20130...,...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Error...","<?xml version=""1.0"" encoding=""UTF-8""?>\n<Error..."


## Cleaning

In [6]:
documents = dict()

start = re.compile(r'<DOCUMENT>')
end = re.compile(r'</DOCUMENT>')   

for key, value in filings.items():
    tmp = []

    start_idxs = [x.end() for x in start.finditer(value)]
    end_idxs = [x.start() for x in end.finditer(value)]

    for idx1, idx2 in zip(start_idxs, end_idxs):
        tmp.append(value[idx1:idx2])
    
    documents[key] = tmp



In [7]:
df2 = pd.DataFrame([documents])
df2.index = ['documents']

df3 = pd.concat([df, df2])
df3.head()

Unnamed: 0,2022-02-04,2021-02-03,2020-01-31,2019-02-01,2018-02-02,2017-02-10,2016-01-29,2015-01-30,2014-01-31,2013-01-30,...,2010-01-29,2009-01-30,2008-02-11,2007-02-16,2006-02-17,2005-03-11,2004-02-25,2003-02-19,2000-03-29,1999-03-05
filing_data,<SEC-DOCUMENT>0001018724-22-000005.txt : 20220...,<SEC-DOCUMENT>0001018724-21-000004.txt : 20210...,<SEC-DOCUMENT>0001018724-20-000004.txt : 20200...,<SEC-DOCUMENT>0001018724-19-000004.txt : 20190...,<SEC-DOCUMENT>0001018724-18-000005.txt : 20180...,<SEC-DOCUMENT>0001018724-17-000011.txt : 20170...,<SEC-DOCUMENT>0001018724-16-000172.txt : 20160...,<SEC-DOCUMENT>0001018724-15-000006.txt : 20150...,<SEC-DOCUMENT>0001018724-14-000006.txt : 20140...,<SEC-DOCUMENT>0001193125-13-028520.txt : 20130...,...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Error...","<?xml version=""1.0"" encoding=""UTF-8""?>\n<Error..."
documents,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-202...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-202...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-201...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-201...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-201...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-201...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-201...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-201...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-201...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>d445434d...,...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>d10k.htm...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>d10k.htm...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>d10k.htm...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>d10k.htm...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>d10k.htm...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>d10k.htm...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>d10k.htm...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>v87419or...,[],[]


In [8]:
docs_10k = dict()

retype = re.compile(r'<TYPE>')

for key, value in documents.items():
    for idx, v in enumerate(value):
        start_idx = [x.end() for x in retype.finditer(v)][0]

        if v[start_idx:].split("\n")[0] == "10-K":
            docs_10k[key] = v


In [9]:
df2 = pd.DataFrame([docs_10k])
df2.index = ['docs_10k']

df4 = pd.concat([df3, df2])
df4.head()

Unnamed: 0,2022-02-04,2021-02-03,2020-01-31,2019-02-01,2018-02-02,2017-02-10,2016-01-29,2015-01-30,2014-01-31,2013-01-30,...,2010-01-29,2009-01-30,2008-02-11,2007-02-16,2006-02-17,2005-03-11,2004-02-25,2003-02-19,2000-03-29,1999-03-05
filing_data,<SEC-DOCUMENT>0001018724-22-000005.txt : 20220...,<SEC-DOCUMENT>0001018724-21-000004.txt : 20210...,<SEC-DOCUMENT>0001018724-20-000004.txt : 20200...,<SEC-DOCUMENT>0001018724-19-000004.txt : 20190...,<SEC-DOCUMENT>0001018724-18-000005.txt : 20180...,<SEC-DOCUMENT>0001018724-17-000011.txt : 20170...,<SEC-DOCUMENT>0001018724-16-000172.txt : 20160...,<SEC-DOCUMENT>0001018724-15-000006.txt : 20150...,<SEC-DOCUMENT>0001018724-14-000006.txt : 20140...,<SEC-DOCUMENT>0001193125-13-028520.txt : 20130...,...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Error...","<?xml version=""1.0"" encoding=""UTF-8""?>\n<Error..."
documents,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-202...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-202...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-201...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-201...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-201...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-201...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-201...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-201...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-201...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>d445434d...,...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>d10k.htm...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>d10k.htm...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>d10k.htm...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>d10k.htm...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>d10k.htm...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>d10k.htm...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>d10k.htm...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>v87419or...,[],[]
docs_10k,\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-2021...,\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-2020...,\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-2019...,\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-2018...,\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-2017...,\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-2016...,\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-2015...,\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-2014...,\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-2013...,\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>d445434d1...,...,\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>d10k.htm\...,\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>d10k.htm\...,\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>d10k.htm\...,\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>d10k.htm\...,\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>d10k.htm\...,\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>d10k.htm\...,\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>d10k.htm\...,\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>v87419ore...,,


In [10]:
docs_10k_clean = dict()

for key, data in docs_10k.items():
    x = data.lower()
    x = BeautifulSoup(x, 'html.parser').get_text()
    docs_10k_clean[key] = x

df2 = pd.DataFrame([docs_10k_clean])
df2.index = ['docs_10k_clean']

df5 = pd.concat([df4, df2])
df5.head()

Unnamed: 0,2022-02-04,2021-02-03,2020-01-31,2019-02-01,2018-02-02,2017-02-10,2016-01-29,2015-01-30,2014-01-31,2013-01-30,...,2010-01-29,2009-01-30,2008-02-11,2007-02-16,2006-02-17,2005-03-11,2004-02-25,2003-02-19,2000-03-29,1999-03-05
filing_data,<SEC-DOCUMENT>0001018724-22-000005.txt : 20220...,<SEC-DOCUMENT>0001018724-21-000004.txt : 20210...,<SEC-DOCUMENT>0001018724-20-000004.txt : 20200...,<SEC-DOCUMENT>0001018724-19-000004.txt : 20190...,<SEC-DOCUMENT>0001018724-18-000005.txt : 20180...,<SEC-DOCUMENT>0001018724-17-000011.txt : 20170...,<SEC-DOCUMENT>0001018724-16-000172.txt : 20160...,<SEC-DOCUMENT>0001018724-15-000006.txt : 20150...,<SEC-DOCUMENT>0001018724-14-000006.txt : 20140...,<SEC-DOCUMENT>0001193125-13-028520.txt : 20130...,...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<Error...","<?xml version=""1.0"" encoding=""UTF-8""?>\n<Error..."
documents,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-202...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-202...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-201...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-201...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-201...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-201...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-201...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-201...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-201...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>d445434d...,...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>d10k.htm...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>d10k.htm...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>d10k.htm...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>d10k.htm...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>d10k.htm...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>d10k.htm...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>d10k.htm...,[\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>v87419or...,[],[]
docs_10k,\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-2021...,\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-2020...,\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-2019...,\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-2018...,\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-2017...,\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-2016...,\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-2015...,\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-2014...,\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-2013...,\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>d445434d1...,...,\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>d10k.htm\...,\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>d10k.htm\...,\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>d10k.htm\...,\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>d10k.htm\...,\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>d10k.htm\...,\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>d10k.htm\...,\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>d10k.htm\...,\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>v87419ore...,,
docs_10k_clean,\n10-k\n1\namzn-20211231.htm\n10-k\n\n\n\namzn...,\n10-k\n1\namzn-20201231.htm\n10-k\n\n\n\namzn...,\n10-k\n1\namzn-20191231x10k.htm\n10-k\n\n\n\n...,\n10-k\n1\namzn-20181231x10k.htm\n10-k\n\n\n\n...,\n10-k\n1\namzn-20171231x10k.htm\n10-k\n\n\n\n...,\n10-k\n1\namzn-20161231x10k.htm\nform 10-k\n\...,\n10-k\n1\namzn-20151231x10k.htm\nform 10-k\n\...,\n10-k\n1\namzn-20141231x10k.htm\nform 10-k\n\...,\n10-k\n1\namzn-20131231x10k.htm\nform 10-k\n\...,\n10-k\n1\nd445434d10k.htm\nform 10-k\n\n\nfor...,...,\n10-k\n1\nd10k.htm\nform 10-k\n\n\nform 10-k\...,\n10-k\n1\nd10k.htm\nform 10-k\n\n\nform 10-k\...,\n10-k\n1\nd10k.htm\nform 10-k\n\n\nform 10-k\...,\n10-k\n1\nd10k.htm\nform 10-k\n\n\nform 10-k\...,\n10-k\n1\nd10k.htm\nannual report\n\n\nannual...,\n10-k\n1\nd10k.htm\nannual report for the fis...,\n10-k\n1\nd10k.htm\nform 10-k for the fiscal ...,\n10-k\n1\nv87419ore10vk.htm\nform 10-k\n\n\n\...,,


In [11]:
dates = df5.columns

In [12]:
df5 = df5.T
df6=pd.DataFrame()
df6["raw_data"]=df5["docs_10k_clean"]

In [13]:
df6.head()

Unnamed: 0,raw_data
2022-02-04,\n10-k\n1\namzn-20211231.htm\n10-k\n\n\n\namzn...
2021-02-03,\n10-k\n1\namzn-20201231.htm\n10-k\n\n\n\namzn...
2020-01-31,\n10-k\n1\namzn-20191231x10k.htm\n10-k\n\n\n\n...
2019-02-01,\n10-k\n1\namzn-20181231x10k.htm\n10-k\n\n\n\n...
2018-02-02,\n10-k\n1\namzn-20171231x10k.htm\n10-k\n\n\n\n...


In [14]:
df6["raw_data"] = df6["raw_data"].apply(lambda x: str(x))

df6=preprocess_data.process(df6)
df6["final_cleaned"] = df6["cleaned_no_stop_punc_data"].apply(lambda x: ' '.join(s for s in x.split() if not any(c.isdigit() for c in s)))
df6.head()

Unnamed: 0,raw_data,cleaned_data,cleaned_no_stop_punc_data,final_cleaned
2022-02-04,\n10-k\n1\namzn-20211231.htm\n10-k\n\n\n\namzn...,10-k amzn-20211231.htm 10-k of contentsunited ...,10 k amzn 20211231 htm 10 k of contentsunited ...,k amzn htm k of contentsunited statessecuritie...
2021-02-03,\n10-k\n1\namzn-20201231.htm\n10-k\n\n\n\namzn...,10-k amzn-20201231.htm 10-k amzn-20201231false...,10 k amzn 20201231 htm 10 k amzn 20201231false...,k amzn htm k amzn gaap accruedliabilitiescurre...
2020-01-31,\n10-k\n1\namzn-20191231x10k.htm\n10-k\n\n\n\n...,10-k amzn-20191231x10k.htm 10-k document 0.5p1...,10 k amzn 20191231x10k htm 10 k document 0 5p1...,k amzn htm k document us gaap productmember us...
2019-02-01,\n10-k\n1\namzn-20181231x10k.htm\n10-k\n\n\n\n...,10-k amzn-20181231x10k.htm 10-k document table...,10 k amzn 20181231x10k htm 10 k document table...,k amzn htm k document table of contents united...
2018-02-02,\n10-k\n1\namzn-20171231x10k.htm\n10-k\n\n\n\n...,10-k amzn-20171231x10k.htm 10-k document unite...,10 k amzn 20171231x10k htm 10 k document unite...,k amzn htm k document united statessecurities ...


In [15]:
df6.shape

(22, 4)

In [16]:
data=pd.DataFrame()
data["dates"]=dates
data['data'] = df6['final_cleaned'].values

In [17]:
data.head()

Unnamed: 0,dates,data
0,2022-02-04,k amzn htm k of contentsunited statessecuritie...
1,2021-02-03,k amzn htm k amzn gaap accruedliabilitiescurre...
2,2020-01-31,k amzn htm k document us gaap productmember us...
3,2019-02-01,k amzn htm k document table of contents united...
4,2018-02-02,k amzn htm k document united statessecurities ...


In [19]:
data.to_csv('output/sec_10k.csv', index=False)