In [1]:
import pandas as pd
import numpy as np
import os
import re
import requests
import requests_random_user_agent
from bs4 import BeautifulSoup
import preprocess_data

[nltk_data] Downloading package punkt to
[nltk_data]     /home/codevardhan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/codevardhan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codevardhan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Scraping

In [2]:
# Company identifier (CIK) substituted into the EDGAR query URL built in the
# next cell. Stored as a string so the leading zeros are preserved.
amazon_cik = '0001018724'

In [3]:
# EDGAR "browse" endpoint, queried for the company's 8-K filings as an Atom feed.
rss_url = f'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={amazon_cik}&type=8-k&start=0&count=10000&owner=exclude&output=atom'
response = requests.get(rss_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as a feed.
response.raise_for_status()
sec_data = response.text  # decoded copy, kept for interactive inspection

# Hand the parser the raw bytes so it can honour the encoding declared by the
# document itself. Re-encoding the decoded text as ASCII raises
# UnicodeEncodeError as soon as the feed contains one non-ASCII character.
feed = BeautifulSoup(response.content, features="xml").feed
if feed is None:
    raise ValueError(f'No <feed> element found in the response from {rss_url}')

# One (filing URL, filing type, filing date) tuple per top-level <entry>.
data = [
    (
        entry.content.find('filing-href').getText(),
        entry.content.find('filing-type').getText(),
        entry.content.find('filing-date').getText(),
    )
    for entry in feed.find_all('entry', recursive=False)
]



In [4]:
# Maps filing date -> full text of the filing submission.
# NOTE(review): the key is the date only, so two 8-Ks filed on the same day
# overwrite each other (the later feed entry wins) — confirm this is acceptable.
filings = dict()

for url, ftype, fdate in data:
    # Keep only entries whose type is exactly '8-K'.
    if ftype != '8-K':
        continue

    # Swap the '-index.htm' suffix of the filing URL for '.txt' to get the
    # plain-text version of the same filing.
    rurl = url.replace('-index.htm', '.txt')
    response = requests.get(rurl, timeout=30)
    # Without this check an HTTP error page would be stored as filing text.
    response.raise_for_status()
    filings[fdate] = response.text


In [5]:
# Single-row frame: one column per filing date, the row labelled
# 'filing_data' holding the raw text of each filing.
df = pd.DataFrame([filings], index=['filing_data'])
df.head()

Unnamed: 0,2022-04-28,2022-04-14,2022-04-13,2022-03-09,2022-02-03,2021-11-12,2021-10-28,2021-09-20,2021-07-29,2021-07-02,...,2013-05-24,2013-04-25,2013-04-12,2013-01-29,2012-12-28,2012-11-29,2012-10-25,2012-10-05,2012-07-26,2012-05-25
filing_data,<SEC-DOCUMENT>0001018724-22-000011.txt : 20220...,<SEC-DOCUMENT>0001104659-22-045579.txt : 20220...,<SEC-DOCUMENT>0001193125-22-104336.txt : 20220...,<SEC-DOCUMENT>0001018724-22-000009.txt : 20220...,<SEC-DOCUMENT>0001018724-22-000002.txt : 20220...,<SEC-DOCUMENT>0001018724-21-000030.txt : 20211...,<SEC-DOCUMENT>0001018724-21-000026.txt : 20211...,<SEC-DOCUMENT>0001018724-21-000022.txt : 20210...,<SEC-DOCUMENT>0001018724-21-000018.txt : 20210...,<SEC-DOCUMENT>0001018724-21-000016.txt : 20210...,...,<SEC-DOCUMENT>0001193125-13-235908.txt : 20130...,<SEC-DOCUMENT>0001193125-13-173923.txt : 20130...,<SEC-DOCUMENT>0001193125-13-151836.txt : 20130...,<SEC-DOCUMENT>0001193125-13-028189.txt : 20130...,<SEC-DOCUMENT>0001193125-12-518418.txt : 20121...,<SEC-DOCUMENT>0001193125-12-485763.txt : 20121...,<SEC-DOCUMENT>0001193125-12-435477.txt : 20121...,<SEC-DOCUMENT>0001193125-12-416835.txt : 20121...,<SEC-DOCUMENT>0001193125-12-316828.txt : 20120...,<SEC-DOCUMENT>0001193125-12-249407.txt : 20120...


## Cleaning

In [6]:
# Opening and closing markers that delimit each embedded document in a filing.
start = re.compile(r'<DOCUMENT>')
end = re.compile(r'</DOCUMENT>')

# For every filing, pair the n-th opening tag with the n-th closing tag and
# keep the text strictly between them (the tags themselves are excluded).
documents = {
    filing_date: [
        filing_text[opening.end():closing.start()]
        for opening, closing in zip(
            start.finditer(filing_text), end.finditer(filing_text)
        )
    ]
    for filing_date, filing_text in filings.items()
}



In [7]:
# Row 'documents': for each filing date, the list of document bodies
# extracted from that filing.
df2 = pd.DataFrame([documents], index=['documents'])

# Stack the new row underneath the raw filing text.
df3 = pd.concat([df, df2])
df3.head()

Unnamed: 0,2022-04-28,2022-04-14,2022-04-13,2022-03-09,2022-02-03,2021-11-12,2021-10-28,2021-09-20,2021-07-29,2021-07-02,...,2013-05-24,2013-04-25,2013-04-12,2013-01-29,2012-12-28,2012-11-29,2012-10-25,2012-10-05,2012-07-26,2012-05-25
filing_data,<SEC-DOCUMENT>0001018724-22-000011.txt : 20220...,<SEC-DOCUMENT>0001104659-22-045579.txt : 20220...,<SEC-DOCUMENT>0001193125-22-104336.txt : 20220...,<SEC-DOCUMENT>0001018724-22-000009.txt : 20220...,<SEC-DOCUMENT>0001018724-22-000002.txt : 20220...,<SEC-DOCUMENT>0001018724-21-000030.txt : 20211...,<SEC-DOCUMENT>0001018724-21-000026.txt : 20211...,<SEC-DOCUMENT>0001018724-21-000022.txt : 20210...,<SEC-DOCUMENT>0001018724-21-000018.txt : 20210...,<SEC-DOCUMENT>0001018724-21-000016.txt : 20210...,...,<SEC-DOCUMENT>0001193125-13-235908.txt : 20130...,<SEC-DOCUMENT>0001193125-13-173923.txt : 20130...,<SEC-DOCUMENT>0001193125-13-151836.txt : 20130...,<SEC-DOCUMENT>0001193125-13-028189.txt : 20130...,<SEC-DOCUMENT>0001193125-12-518418.txt : 20121...,<SEC-DOCUMENT>0001193125-12-485763.txt : 20121...,<SEC-DOCUMENT>0001193125-12-435477.txt : 20121...,<SEC-DOCUMENT>0001193125-12-416835.txt : 20121...,<SEC-DOCUMENT>0001193125-12-316828.txt : 20120...,<SEC-DOCUMENT>0001193125-12-249407.txt : 20120...
documents,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>amzn-2022...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>tm226241d...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d325485d8...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>amzn-2022...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>amzn-2022...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>amzn-2021...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>amzn-2021...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>amzn-2021...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>amzn-2021...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>amzn-2021...,...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d542615d8...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d521900d8...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d511111d8...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d467316d8...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d459517d8...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d445039d8...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d418261d8...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d418265d8...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d370065d8...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d351330d8...


In [8]:
# Despite its name, this maps filing date -> the document whose <TYPE> is "8-K".
docs_10k = dict()

retype = re.compile(r'<TYPE>')

for key, value in documents.items():
    for v in value:
        type_tag = retype.search(v)
        if type_tag is None:
            # No <TYPE> header in this document: skip it rather than fail
            # with an IndexError.
            continue

        # The document type is the rest of the line after the first <TYPE>
        # tag; strip() tolerates stray whitespace such as a trailing '\r'.
        doc_type = v[type_tag.end():].split("\n", 1)[0].strip()
        if doc_type == "8-K":
            # If a filing contains several 8-K documents, the last one wins.
            docs_10k[key] = v


In [9]:
# Row 'docs_10k': the single selected document for each filing date.
df2 = pd.DataFrame([docs_10k], index=['docs_10k'])

# Append it below the rows collected so far.
df4 = pd.concat([df3, df2])
df4.head()

Unnamed: 0,2022-04-28,2022-04-14,2022-04-13,2022-03-09,2022-02-03,2021-11-12,2021-10-28,2021-09-20,2021-07-29,2021-07-02,...,2013-05-24,2013-04-25,2013-04-12,2013-01-29,2012-12-28,2012-11-29,2012-10-25,2012-10-05,2012-07-26,2012-05-25
filing_data,<SEC-DOCUMENT>0001018724-22-000011.txt : 20220...,<SEC-DOCUMENT>0001104659-22-045579.txt : 20220...,<SEC-DOCUMENT>0001193125-22-104336.txt : 20220...,<SEC-DOCUMENT>0001018724-22-000009.txt : 20220...,<SEC-DOCUMENT>0001018724-22-000002.txt : 20220...,<SEC-DOCUMENT>0001018724-21-000030.txt : 20211...,<SEC-DOCUMENT>0001018724-21-000026.txt : 20211...,<SEC-DOCUMENT>0001018724-21-000022.txt : 20210...,<SEC-DOCUMENT>0001018724-21-000018.txt : 20210...,<SEC-DOCUMENT>0001018724-21-000016.txt : 20210...,...,<SEC-DOCUMENT>0001193125-13-235908.txt : 20130...,<SEC-DOCUMENT>0001193125-13-173923.txt : 20130...,<SEC-DOCUMENT>0001193125-13-151836.txt : 20130...,<SEC-DOCUMENT>0001193125-13-028189.txt : 20130...,<SEC-DOCUMENT>0001193125-12-518418.txt : 20121...,<SEC-DOCUMENT>0001193125-12-485763.txt : 20121...,<SEC-DOCUMENT>0001193125-12-435477.txt : 20121...,<SEC-DOCUMENT>0001193125-12-416835.txt : 20121...,<SEC-DOCUMENT>0001193125-12-316828.txt : 20120...,<SEC-DOCUMENT>0001193125-12-249407.txt : 20120...
documents,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>amzn-2022...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>tm226241d...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d325485d8...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>amzn-2022...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>amzn-2022...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>amzn-2021...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>amzn-2021...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>amzn-2021...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>amzn-2021...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>amzn-2021...,...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d542615d8...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d521900d8...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d511111d8...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d467316d8...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d459517d8...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d445039d8...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d418261d8...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d418265d8...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d370065d8...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d351330d8...
docs_10k,\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>amzn-20220...,\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>tm226241d2...,\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d325485d8k...,\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>amzn-20220...,\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>amzn-20220...,\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>amzn-20211...,\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>amzn-20211...,\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>amzn-20210...,\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>amzn-20210...,\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>amzn-20210...,...,\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d542615d8k...,\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d521900d8k...,\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d511111d8k...,\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d467316d8k...,\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d459517d8k...,\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d445039d8k...,\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d418261d8k...,\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d418265d8k...,\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d370065d8k...,\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d351330d8k...


In [10]:
# Lower-case each selected document and strip its markup, keeping only the
# text content. The iteration variable is deliberately not called `data`:
# that name holds the list of feed entries built while scraping, and
# rebinding it here would break a re-run of the download cell.
docs_10k_clean = {
    key: BeautifulSoup(raw_doc.lower(), 'html.parser').get_text()
    for key, raw_doc in docs_10k.items()
}

# Row 'docs_10k_clean': the markup-free text for each filing date.
df2 = pd.DataFrame([docs_10k_clean], index=['docs_10k_clean'])

df5 = pd.concat([df4, df2])
df5.head()



Unnamed: 0,2022-04-28,2022-04-14,2022-04-13,2022-03-09,2022-02-03,2021-11-12,2021-10-28,2021-09-20,2021-07-29,2021-07-02,...,2013-05-24,2013-04-25,2013-04-12,2013-01-29,2012-12-28,2012-11-29,2012-10-25,2012-10-05,2012-07-26,2012-05-25
filing_data,<SEC-DOCUMENT>0001018724-22-000011.txt : 20220...,<SEC-DOCUMENT>0001104659-22-045579.txt : 20220...,<SEC-DOCUMENT>0001193125-22-104336.txt : 20220...,<SEC-DOCUMENT>0001018724-22-000009.txt : 20220...,<SEC-DOCUMENT>0001018724-22-000002.txt : 20220...,<SEC-DOCUMENT>0001018724-21-000030.txt : 20211...,<SEC-DOCUMENT>0001018724-21-000026.txt : 20211...,<SEC-DOCUMENT>0001018724-21-000022.txt : 20210...,<SEC-DOCUMENT>0001018724-21-000018.txt : 20210...,<SEC-DOCUMENT>0001018724-21-000016.txt : 20210...,...,<SEC-DOCUMENT>0001193125-13-235908.txt : 20130...,<SEC-DOCUMENT>0001193125-13-173923.txt : 20130...,<SEC-DOCUMENT>0001193125-13-151836.txt : 20130...,<SEC-DOCUMENT>0001193125-13-028189.txt : 20130...,<SEC-DOCUMENT>0001193125-12-518418.txt : 20121...,<SEC-DOCUMENT>0001193125-12-485763.txt : 20121...,<SEC-DOCUMENT>0001193125-12-435477.txt : 20121...,<SEC-DOCUMENT>0001193125-12-416835.txt : 20121...,<SEC-DOCUMENT>0001193125-12-316828.txt : 20120...,<SEC-DOCUMENT>0001193125-12-249407.txt : 20120...
documents,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>amzn-2022...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>tm226241d...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d325485d8...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>amzn-2022...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>amzn-2022...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>amzn-2021...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>amzn-2021...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>amzn-2021...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>amzn-2021...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>amzn-2021...,...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d542615d8...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d521900d8...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d511111d8...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d467316d8...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d459517d8...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d445039d8...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d418261d8...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d418265d8...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d370065d8...,[\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d351330d8...
docs_10k,\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>amzn-20220...,\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>tm226241d2...,\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d325485d8k...,\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>amzn-20220...,\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>amzn-20220...,\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>amzn-20211...,\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>amzn-20211...,\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>amzn-20210...,\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>amzn-20210...,\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>amzn-20210...,...,\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d542615d8k...,\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d521900d8k...,\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d511111d8k...,\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d467316d8k...,\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d459517d8k...,\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d445039d8k...,\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d418261d8k...,\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d418265d8k...,\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d370065d8k...,\n<TYPE>8-K\n<SEQUENCE>1\n<FILENAME>d351330d8k...
docs_10k_clean,\n8-k\n1\namzn-20220428.htm\n8-k\n\n\n\namzn-2...,\n8-k\n1\ntm226241d2_8k.htm\nform 8-k\n\n\n\n\...,\n8-k\n1\nd325485d8k.htm\n8-k\n\n\n\n\n\n8-k\n...,\n8-k\n1\namzn-20220309.htm\n8-k\n\n\n\namzn-2...,\n8-k\n1\namzn-20220203.htm\n8-k\n\n\n\namzn-2...,\n8-k\n1\namzn-20211110.htm\n8-k\n\n\n\namzn-2...,\n8-k\n1\namzn-20211028.htm\n8-k\n\n\n\namzn-2...,\n8-k\n1\namzn-20210920.htm\n8-k\n\n\n\namzn-2...,\n8-k\n1\namzn-20210729.htm\n8-k\n\n\n\namzn-2...,\n8-k\n1\namzn-20210628.htm\n8-k\n\n\n\namzn-2...,...,\n8-k\n1\nd542615d8k.htm\nform 8-k\n\n\nform 8...,\n8-k\n1\nd521900d8k.htm\nform 8-k\n\n\nform 8...,\n8-k\n1\nd511111d8k.htm\n8-k\n\n\n8-k\n\n\n \...,\n8-k\n1\nd467316d8k.htm\nform 8-k\n\n\nform 8...,\n8-k\n1\nd459517d8k.htm\nform 8-k\n\n\nform 8...,\n8-k\n1\nd445039d8k.htm\nform 8-k\n\n\nform 8...,\n8-k\n1\nd418261d8k.htm\nform 8-k\n\n\nform 8...,\n8-k\n1\nd418265d8k.htm\nform 8-k\n\n\nform 8...,\n8-k\n1\nd370065d8k.htm\nform 8-k\n\n\nform 8...,\n8-k\n1\nd351330d8k.htm\nform 8-k\n\n\nform 8...


In [11]:
# Column labels of df5 are the filing-date keys of the per-filing dicts;
# they are reused later as the 'dates' column of the exported frame.
dates = df5.columns

In [12]:
# Build a one-column frame (index = filing date, column 'raw_data') from the
# 'docs_10k_clean' row. The row is selected directly rather than through
# `df5 = df5.T`: rebinding df5 to its transpose made this cell fail with a
# KeyError on a second run, because the row labels had become the dates.
df6 = df5.loc["docs_10k_clean"].to_frame(name="raw_data")

In [13]:
# Preview the first rows of the 'raw_data' column before preprocessing.
df6.head()

Unnamed: 0,raw_data
2022-04-28,\n8-k\n1\namzn-20220428.htm\n8-k\n\n\n\namzn-2...
2022-04-14,\n8-k\n1\ntm226241d2_8k.htm\nform 8-k\n\n\n\n\...
2022-04-13,\n8-k\n1\nd325485d8k.htm\n8-k\n\n\n\n\n\n8-k\n...
2022-03-09,\n8-k\n1\namzn-20220309.htm\n8-k\n\n\n\namzn-2...
2022-02-03,\n8-k\n1\namzn-20220203.htm\n8-k\n\n\n\namzn-2...


In [14]:
def _drop_digit_tokens(text):
    """Return `text` with every whitespace-separated token that contains a digit removed.

    The surviving tokens are re-joined with single spaces.
    """
    kept = []
    for token in text.split():
        if any(ch.isdigit() for ch in token):
            continue
        kept.append(token)
    return ' '.join(kept)


# preprocess_data.process is a project helper. Judging by the displayed output
# it adds the 'cleaned_data' and 'cleaned_no_stop_punc_data' columns.
# NOTE(review): confirm against the helper's source.
df6 = preprocess_data.process(df6)
df6["final_cleaned"] = df6["cleaned_no_stop_punc_data"].apply(_drop_digit_tokens)
df6.head()

Unnamed: 0,raw_data,cleaned_data,cleaned_no_stop_punc_data,final_cleaned
2022-04-28,\n8-k\n1\namzn-20220428.htm\n8-k\n\n\n\namzn-2...,8-k amzn-20220428.htm 8-k amzn-202204280001018...,8k amzn20220428htm 8k amzn202204280001018724fa...,of contentsunited statessecurities and exchang...
2022-04-14,\n8-k\n1\ntm226241d2_8k.htm\nform 8-k\n\n\n\n\...,8-k tm226241d2_8k.htm form 8-k false inc iso42...,8k tm226241d28khtm form 8k false inc iso4217us...,form false inc xbrlishares xbrlishares table o...
2022-04-13,\n8-k\n1\nd325485d8k.htm\n8-k\n\n\n\n\n\n8-k\n...,8-k d325485d8k.htm 8-k 8-k false united states...,8k d325485d8khtm 8k 8k false united states sec...,false united states securities and exchange co...
2022-03-09,\n8-k\n1\namzn-20220309.htm\n8-k\n\n\n\namzn-2...,8-k amzn-20220309.htm 8-k amzn-202203090001018...,8k amzn20220309htm 8k amzn202203090001018724fa...,of contentsunited statessecurities and exchang...
2022-02-03,\n8-k\n1\namzn-20220203.htm\n8-k\n\n\n\namzn-2...,8-k amzn-20220203.htm 8-k amzn-202202030001018...,8k amzn20220203htm 8k amzn202202030001018724fa...,of contentsunited statessecurities and exchang...


In [15]:
# Sanity check: (number of filings, number of text columns).
df6.shape

(97, 4)

In [16]:
# Export frame with a fresh 0..n-1 index: the filing dates next to the fully
# cleaned text. `.values` drops df6's date index so the two columns are paired
# purely by position.
data = pd.DataFrame(
    {
        "dates": dates,
        "data": df6['final_cleaned'].values,
    }
)

In [17]:
# Preview the frame that is about to be written to disk.
data.head()

Unnamed: 0,dates,data
0,2022-04-28,of contentsunited statessecurities and exchang...
1,2022-04-14,form false inc xbrlishares xbrlishares table o...
2,2022-04-13,false united states securities and exchange co...
3,2022-03-09,of contentsunited statessecurities and exchang...
4,2022-02-03,of contentsunited statessecurities and exchang...


In [18]:
# DataFrame.to_csv does not create missing parent directories, so make sure
# the target folder exists before writing.
os.makedirs('output', exist_ok=True)
data.to_csv('output/out.csv', index=False)