# Data 

In [1]:
!pip install sec-api
!pip install pydash
!pip install jsonlines



In [1]:
from sec_api import InsiderTradingApi
from sec_api import MappingApi
import pandas as pd
import numpy as np
import math
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from pydash import get, flatten
import json
import jsonlines
from datetime import datetime, timedelta
import requests
import warnings
import os

warnings.filterwarnings("ignore")

## Data Gathering

### Datasets

Two different datasets were used for the time period 2018-2023. The close price return dataset contains the daily adjusted and unadjusted close prices from listed companies on US exchanges as well as the daily returns. The insider trades dataset contains the historical insider buy and sell transactions involving non derivatives transactions of all publicly listed companies on US exchanges from SEC API. In addition, we enrich the dataset by linking CIK identifiers of companies to their sector and industry classifications.

<hr>

### Close Price Return Dataset

#### Aquisition of the Close Price Return Dataset

Close price return data was provided as flat files on this server. This script automatically loads all data and save it as a csv file.

In [3]:
share_prices = {}
repo_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir)+ '/algoseek_gk/algoseek')
for issuerTicker in os.listdir(repo_path):
    try:
        with open(os.path.abspath(repo_path + '/'+ issuerTicker +'/_closing-prices.json')) as f:
            share_prices.update(json.load(f))
    except: 
        #print(issuerTicker + " does not contain the file _closing-prices.json")
        pass   

In [133]:
# save close prices for every ticker in csv file

df = pd.DataFrame.from_dict(share_prices, orient='index')
close_prices = df.set_index(df.index.str.split("-", expand = True ).map(lambda x: (x[0], pd.to_datetime(x[1]))))
close_prices.to_csv('close_prices.csv')

Unnamed: 0,Unnamed: 1,adjClosePrice,unadjClosePrice
GORO,2013-07-09,3.8918,8.449
GORO,2013-07-18,3.5192,7.64
GORO,2013-07-22,3.6850,8.0
GORO,2013-07-23,3.8646,8.39
GORO,2013-07-24,3.6527,7.93
...,...,...,...
TWNK,2023-10-31,33.4000,33.4
TWNK,2023-11-01,33.3400,33.34
TWNK,2023-11-02,33.3700,33.37
TWNK,2023-11-03,33.3500,33.35


In [2]:
# load csv file with close prices and adjust data

data = pd.read_csv('close_prices.csv')
data = data.rename(columns={'Unnamed: 0': 'issuerTicker', 'Unnamed: 1': 'Date'})
data = data.set_index(['issuerTicker', 'Date'])
data

Unnamed: 0_level_0,Unnamed: 1_level_0,adjClosePrice,unadjClosePrice
issuerTicker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1
GORO,2013-07-09,3.8918,8.449
GORO,2013-07-18,3.5192,7.640
GORO,2013-07-22,3.6850,8.000
GORO,2013-07-23,3.8646,8.390
GORO,2013-07-24,3.6527,7.930
...,...,...,...
TWNK,2023-10-31,33.4000,33.400
TWNK,2023-11-01,33.3400,33.340
TWNK,2023-11-02,33.3700,33.370
TWNK,2023-11-03,33.3500,33.350


Calcute the daily returns of the adjusted close prices for each ticker and save the result as csv file.

In [319]:
#create close_prices_returns.csv
'''
data = pd.read_csv('close_prices.csv', parse_dates = ["Unnamed: 1"])
data.rename(columns={'Unnamed: 0': 'issuerTicker', 'Unnamed: 1': 'Date'}, inplace = True)
data = data.set_index(['issuerTicker', 'Date'])
data.sort_index(inplace= True)
data["returns"] = data["adjClosePrice"]/data["adjClosePrice"].shift(1) -1 #incorrect: first date of each ticker contains false return
first_dates = data.groupby(level=0).apply(lambda x: x.index.get_level_values(1).min())
data.drop(pd.MultiIndex.from_arrays([first_dates.index.values, first_dates.values]), axis=0, inplace=True)
data.to_csv("close_prices_returns.csv")
'''

'\ndata = pd.read_csv(\'close_prices.csv\', parse_dates = ["Unnamed: 1"])\ndata.rename(columns={\'Unnamed: 0\': \'issuerTicker\', \'Unnamed: 1\': \'Date\'}, inplace = True)\ndata = data.set_index([\'issuerTicker\', \'Date\'])\ndata.sort_index(inplace= True)\ndata["returns"] = data["adjClosePrice"]/data["adjClosePrice"].shift(1) -1 #incorrect: first date of each ticker contains false return\nfirst_dates = data.groupby(level=0).apply(lambda x: x.index.get_level_values(1).min())\ndata.drop(pd.MultiIndex.from_arrays([first_dates.index.values, first_dates.values]), axis=0, inplace=True)\ndata.to_csv("close_prices_returns.csv")\n'

In [3]:
# read close_prices_returns.csv and adjust the data

returns = pd.read_csv('close_prices_returns.csv')
returns = returns.set_index(['issuerTicker', 'Date'])
returns = returns.reset_index().dropna().set_index(['issuerTicker', 'Date'])
returns = pd.DataFrame(returns) 
returns

Unnamed: 0_level_0,Unnamed: 1_level_0,adjClosePrice,unadjClosePrice,returns
issuerTicker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,2013-05-22,30.4890,46.2400,-0.010910
A,2013-05-23,30.1593,45.7400,-0.010814
A,2013-05-29,30.3776,46.0710,0.007238
A,2013-06-05,29.3615,44.5300,-0.033449
A,2013-06-13,29.3417,44.5000,-0.000674
...,...,...,...,...
ZYXI,2023-11-22,9.0800,9.0800,0.049711
ZYXI,2023-11-24,9.4301,9.4301,0.038557
ZYXI,2023-12-06,8.3900,8.3900,-0.110296
ZYXI,2023-12-13,9.5000,9.5000,0.132300


### Insider Trades Dataset

#### Aquisition of the Insider Trades Dataset

Insider trades data was obtained from SEC API. This script automatically loads all data.

#### Preparation of the Insider Trades Dataset



To gather the data, the data was first flattened. The flatten_filing() function extracts data points from the filing such as all non-derivative transactions, the period of report (=date of transaction), issuer CIK, issuer ticker, reporting person’s name, reporting person’s CIK, and their relationship. It then creates a list called transactions, which will contain all the extracted insider trades.

The total $ amount of the transaction is calculated by simply multiplying the number of shares with the price per transaction. It then merges the extracted data points with the base_data and appends them to the transactions list. 

Every item in the transactions list represents a single insider trading disclosure (Form 3/4/5) including all buy/sell transactions and meta data (details about reporting person, company, etc.) and is formatted as a Python dictionary.

A simple JSON-NL (new line) formatted text file is used to store all insider transactions. This format makes it easier to start, stop and continue downloading transactions as the data parser doesn’t need to load the entire text file, but rather processes each line at a time.

In [4]:
insiderTradingApi = InsiderTradingApi("ed8a46ea85edd76eb7c5cb066b006266d3cd56e4dca2ddd6b511faa6c4219896")

In [5]:
# structure the response data and convert it into a pandas dataframe

def flatten_filing(filing):
  # input : JSON formatted filing from the SEC EDGAR system that contains insider trading information
  # return: list called transactions, which will contain all the extracted insider trades that are Non-Derivative

  transactions = []

  # data points to be added to each transaction
  try:
    base_data = {
        "periodOfReport": filing["periodOfReport"],
        "issuerCik": filing["issuer"]["cik"],
        "issuerTicker": filing["issuer"]["tradingSymbol"],
        "reportingPersonName": get(filing, "reportingOwner.name", ""),
        "reportingPersonCik": get(filing, "reportingOwner.cik", ""),
        "relationship": get(filing, "reportingOwner.relationship", {})
    }
  except Exception as e:
    print(f'{filing["id"]}, caught {type(e)}: {e}')
    return transactions


  if "nonDerivativeTable" in filing and "transactions" in filing["nonDerivativeTable"]:
    # extract the data points of interest from each transaction
    for transaction in filing["nonDerivativeTable"]["transactions"]:
      sharePrice = get(transaction, "amounts.pricePerShare", 0)
      sharesOwnedFollowingTransaction = get(transaction, "postTransactionAmounts.sharesOwnedFollowingTransaction", 0)

      entry = {
          "type": "nonDerivative",
          "securityTitle": transaction["securityTitle"],
          "codingCode": transaction["coding"]["code"],
          "acquiredDisposed": transaction["amounts"]["acquiredDisposedCode"],
          "shares": transaction["amounts"]["shares"],
          "sharePrice": sharePrice,
          "total": math.ceil(transaction["amounts"]["shares"] * sharePrice),
          "sharesOwnedFollowingTransaction": sharesOwnedFollowingTransaction
      }

      # merge base_data and entry into a new dict and append to transactions
      transactions.append({**base_data, **entry})

  return transactions


# convert filings into a pandas dataframe
def flatten_filings(filings):
  unflattened_list = list(map(flatten_filing, filings))
  return [item for sublist in unflattened_list for item in sublist]

The function download_and_save_trades_per_year() downloads and saves all insider trades for a specified year in a log file named trades_[year].txt. It takes a parameter of year which defaults to 2023 and opens or creates the log file in append mode. The function then enters a while loop that fetches new insider disclosures with the get_data() method to get data from the specified day in batches of 50 transactions at a time. If there are no more transactions for that day, it increments the day by one and checks if it is equal to the end of the year. If so, it breaks out of the loop and closes the output file; otherwise, it continues on with getting new data for that day.

In [6]:
# download and save all insider trades for a specified year in a log file named trades_[year].txt

def download_and_save_trades_per_year(year="2023"):
  log_file = open(f"trades_{year}.txt", "a")

  has_data = True
  start_from = 0
  total_filings_saved = 0
  last_count = 0
  date_format = "%Y-%m-%d"
  day = year + "-01-01"

  # fetch new insider disclosures with the get_data() method to get data from the specified day in batches of 50 transactions at a time

  while has_data:
    insider_trades = insiderTradingApi.get_data({
      "query": {"query_string": {"query": f"periodOfReport:{day} AND issuer.tradingSymbol:*"}},
      "from": start_from,
      "size": "50",
      "sort": [{ "filedAt": { "order": "desc" } }]
    })

    if len(insider_trades["transactions"]) == 0:
      start_from = 0
      day_date = datetime.strptime(day, date_format) + timedelta(days=1)
      day = day_date.strftime(date_format)
      print(f'-- {day} --')

      if day == year + "-12-31":
        break

      continue

    total_filings_saved += len(insider_trades["transactions"])
    start_from += 50

    trades = flatten_filings(insider_trades["transactions"])

    for trade in trades:
      log_file.write(json.dumps(trade) + '\n')

    if total_filings_saved > last_count + 500:
      last_count = total_filings_saved
      print(f'{total_filings_saved } saved')

  log_file.close()
  print(f'{year} done. {total_filings_saved} total filings saved')

# uncomment line below to download all transactions for 2023
# download_and_save_trades_per_year(year="2023")

The following code converts the JSON-NL transaction objects into a pandas dataframe. For each transaction, it ensures all the tickers in the issuerTicker column are upper-cased and have no whitespace and converts the periodOfReport column to a datetime format.

After that all transactions from 2018-2023 are merged into one giant dataframe. 

In [14]:
# save insider trades per year in csv files

for year in range(2018, 2023) :
  trades = []

  with jsonlines.open(f"trades_{year}.txt", "r") as reader:
    for trade in reader:
      trades.append(trade)

  trades_df = pd.DataFrame(trades)

  #trades_df.drop('filingId', axis=1, inplace=True)

  trades_df["issuerTicker"] = trades_df["issuerTicker"].apply(lambda x : x.upper().replace(" ", ""))

  trades_df['periodOfReport'] = pd.to_datetime(trades_df['periodOfReport'])

  trades_df.to_csv(f'trades_{year}.csv')

In [7]:
all_trades = pd.DataFrame()

for year in range(2018, 2024):
    trades_per_year = pd.read_csv(f"trades_{year}.csv", low_memory=False)
    
    all_trades = pd.concat([all_trades, trades_per_year])

all_trades['periodOfReport'] = pd.to_datetime(all_trades['periodOfReport'])

all_trades = all_trades.drop(columns=['Unnamed: 0'])

all_trades = all_trades.astype({"issuerCik": str})

## Data Cleaning

Next, we focus on cleaning the transactions. In particular, we remove transactions that meet any of the following criteria:

1. Number of shares = share price
2. Share price > 6000 and number of shares is not 1
3. Total amount per transaction < $1
4. Transaction code is M representing the exercise or conversion of derivative security exempted pursuant to Rule 16b-3
5. Incorrect ticker, e.g. “NONE”, “N/A”
6. Incorrect reporter with CIK matching 810893, 1454510, etc.

In [8]:
# clean data by removing transactions that don't meet any of the following criteria

filter_all = (all_trades["shares"] != all_trades["sharePrice"]) & \
  ( (all_trades["sharePrice"] < 6000) | (all_trades["shares"] == 1) ) & \
  (all_trades["total"] != 0) & \
  (all_trades["codingCode"] != "M") & \
  (all_trades["issuerTicker"] != "NONE") & \
  (all_trades["issuerTicker"] != "N/A") & \
  (all_trades["issuerTicker"] != "NA") & \
  (~all_trades["issuerCik"].str.contains("810893|1454510|1463208|1877939|1556801|827187")) # insider incorrectly reported share price

all_trades = all_trades[filter_all]

all_trades.head(10)

Unnamed: 0,periodOfReport,issuerCik,issuerTicker,reportingPersonName,reportingPersonCik,relationship,type,securityTitle,codingCode,acquiredDisposed,shares,sharePrice,total,sharesOwnedFollowingTransaction
4,2018-01-01,894871,MVEN,Mills Peter B,1376483,"{'isDirector': True, 'isOfficer': False, 'isTe...",nonDerivative,Common Stock,F,D,9659.0,0.44,4250,383125.0
11,2018-01-01,894871,MVEN,Mills Peter B,1376483,"{'isDirector': True, 'isOfficer': False, 'isTe...",nonDerivative,Common Stock,F,D,9659.0,0.44,4250,263009.0
15,2018-01-01,1559865,EVTC,Schuessler Morgan M,1407607,"{'isDirector': True, 'isOfficer': True, 'offic...",nonDerivative,Common Stock,S,D,28948.0,31.28,905494,308928.0
37,2018-01-01,1416090,IMII,CLUFF WHITNEY O,1471006,"{'isDirector': True, 'isOfficer': False, 'isTe...",nonDerivative,Common Stock,P,A,1.0,0.1,1,441911.0
38,2018-01-01,1416090,IMII,CLUFF WHITNEY O,1471006,"{'isDirector': True, 'isOfficer': False, 'isTe...",nonDerivative,Common Stock,P,A,14686.0,0.18,2644,456597.0
40,2018-01-01,1416090,IMII,CLUFF WHITNEY O,1471006,"{'isDirector': True, 'isOfficer': False, 'isTe...",nonDerivative,Common Stock,P,A,10000.0,0.15,1500,491597.0
41,2018-01-01,1416090,IMII,CLUFF WHITNEY O,1471006,"{'isDirector': True, 'isOfficer': False, 'isTe...",nonDerivative,Common Stock,P,A,7500.0,0.14,1050,499097.0
42,2018-01-01,1416090,IMII,CLUFF WHITNEY O,1471006,"{'isDirector': True, 'isOfficer': False, 'isTe...",nonDerivative,Common Stock,P,A,12500.0,0.14,1751,511597.0
43,2018-01-01,1416090,IMII,CLUFF WHITNEY O,1471006,"{'isDirector': True, 'isOfficer': False, 'isTe...",nonDerivative,Common Stock,P,A,25000.0,0.19,4750,536597.0
45,2018-01-01,1691303,HCC,Boyles Dale W,1419071,"{'isDirector': False, 'isOfficer': True, 'offi...",nonDerivative,Common Stock,F,D,1087.0,24.11,26208,36477.0


Incorrectly reported prices seem to happen mostly with OTC stocks. In order to filter out such transactions, we add the corresponding exchange of the ticker to all transactions and then remove transactions without a valid exchange (in this case: NYSE and NASDAQ to consider most of the publicy listed companies on US exchanges)

In [9]:
mappingApi = MappingApi("ed8a46ea85edd76eb7c5cb066b006266d3cd56e4dca2ddd6b511faa6c4219896")

In [10]:
# get all tickers traded on the NYSE and NASDAQ exchange to consider most of the publicly listed companies on US exchanges 

all_nasdaq_listings_json = mappingApi.resolve('exchange', 'NASDAQ')
all_nyse_listings_json = mappingApi.resolve('exchange', 'NYSE')

all_nasdaq_listings = pd.DataFrame(all_nasdaq_listings_json)
all_nyse_listings = pd.DataFrame(all_nyse_listings_json)

The load_ticker_meta_data() function merges the two dataframes, removes columns we don’t need, and returns a single dataframe including the listed companies on NYSE and NASDAQ.

In [11]:
# remove transactions without a valid exchange (in this case NYSE and NASDAQ)
# reason: incorrectly reported prices seem to happen mostly with Over-the-counter stocks

# merge mapping files, remove unnecessary columns, and return a single dataframe including all listed companies

def load_ticker_meta_data():

  all_nyse_listings.drop(["cusip","sic","famaSector","famaIndustry","id", "currency", "location"], axis=1, inplace=True)
  all_nasdaq_listings.drop(["cusip","sic","famaSector","famaIndustry","id", "currency", "location"], axis=1, inplace=True)

  all_nyse_listings.rename(columns={'ticker': 'issuerTicker'}, inplace=True)
  all_nasdaq_listings.rename(columns={'ticker': 'issuerTicker'}, inplace=True)

  return pd.concat([all_nyse_listings, all_nasdaq_listings])


ticker_meta_data = load_ticker_meta_data()
ticker_meta_data.head(5)

Unnamed: 0,name,issuerTicker,cik,exchange,isDelisted,category,sector,industry,sicSector,sicIndustry
0,AGILENT TECHNOLOGIES INC,A,1090872,NYSE,False,Domestic Common Stock,Healthcare,Diagnostics & Research,Manufacturing,Laboratory Analytical Instruments
1,ALCOA CORP,AA,1675149,NYSE,False,Domestic Common Stock,Basic Materials,Aluminum,Manufacturing,Primary Production Of Aluminum
2,ALTANA AKTIENGESELLSCHAFT,AAAGY,1182802,NYSE,True,ADR Common Stock,Healthcare,Biotechnology,Manufacturing,Pharmaceutical Preparations
3,ARES ACQUISITION CORP,AAC,1829432,NYSE,True,Domestic Common Stock Primary Class,Industrials,Shell Companies,Finance Insurance And Real Estate,Blank Checks
4,ARCADIA FINANCIAL LTD,AAC1,879674,NYSE,True,Domestic Common Stock,Financial Services,Asset Management,Finance Insurance And Real Estate,Short-Term Business Credit Institutions


Next, we merge the exchange data with our insider transactions and drop all transactions without an exchange (NYSE and NASDAQ).

In [12]:
# merge the exchange data with the insider transactions and drop all transactions without an exchange (in this case NYSE and NASDAQ)

insider_trades = all_trades.merge(ticker_meta_data,
                              on="issuerTicker",
                              how="left", suffixes=(None,None))

insider_trades = insider_trades[insider_trades["exchange"].notna()]

insider_trades.head(10)

Unnamed: 0,periodOfReport,issuerCik,issuerTicker,reportingPersonName,reportingPersonCik,relationship,type,securityTitle,codingCode,acquiredDisposed,...,sharesOwnedFollowingTransaction,name,cik,exchange,isDelisted,category,sector,industry,sicSector,sicIndustry
2,2018-01-01,1559865,EVTC,Schuessler Morgan M,1407607,"{'isDirector': True, 'isOfficer': True, 'offic...",nonDerivative,Common Stock,S,D,...,308928.0,EVERTEC INC,1559865,NYSE,False,Domestic Common Stock,Technology,Software - Infrastructure,Services,Services-Computer Processing & Data Preparation
9,2018-01-01,1691303,HCC,Boyles Dale W,1419071,"{'isDirector': False, 'isOfficer': True, 'offi...",nonDerivative,Common Stock,F,D,...,36477.0,WARRIOR MET COAL INC,1691303,NYSE,False,Domestic Common Stock,Basic Materials,Coking Coal,Mining,Bituminous Coal & Lignite Mining
10,2018-01-01,1270073,ICPT,Shapiro David,1559570,"{'isDirector': False, 'isOfficer': True, 'offi...",nonDerivative,Common Stock,F,D,...,39911.0,INTERCEPT PHARMACEUTICALS INC,1270073,NASDAQ,True,Domestic Common Stock,Healthcare,Biotechnology,Manufacturing,Pharmaceutical Preparations
11,2018-01-01,1270073,ICPT,Shapiro David,1559570,"{'isDirector': False, 'isOfficer': True, 'offi...",nonDerivative,Common Stock,F,D,...,39911.0,INTERCEPT PHARMACEUTICALS INC,1270073,NASDAQ,True,Domestic Common Stock,Healthcare,Biotechnology,Manufacturing,Pharmaceutical Preparations
12,2018-01-01,1270073,ICPT,Shapiro David,1559570,"{'isDirector': False, 'isOfficer': True, 'offi...",nonDerivative,Common Stock,F,D,...,39911.0,INTERCEPT PHARMACEUTICALS INC,1270073,NASDAQ,True,Domestic Common Stock,Healthcare,Biotechnology,Manufacturing,Pharmaceutical Preparations
13,2018-01-01,1270073,ICPT,Shapiro David,1559570,"{'isDirector': False, 'isOfficer': True, 'offi...",nonDerivative,Common Stock,F,D,...,39911.0,INTERCEPT PHARMACEUTICALS INC,1270073,NASDAQ,True,Domestic Common Stock,Healthcare,Biotechnology,Manufacturing,Pharmaceutical Preparations
14,2018-01-01,1270073,ICPT,Kapadia Sandip,1677141,"{'isDirector': False, 'isOfficer': True, 'offi...",nonDerivative,Common Stock,F,D,...,24541.0,INTERCEPT PHARMACEUTICALS INC,1270073,NASDAQ,True,Domestic Common Stock,Healthcare,Biotechnology,Manufacturing,Pharmaceutical Preparations
15,2018-01-01,1270073,ICPT,Kapadia Sandip,1677141,"{'isDirector': False, 'isOfficer': True, 'offi...",nonDerivative,Common Stock,F,D,...,24541.0,INTERCEPT PHARMACEUTICALS INC,1270073,NASDAQ,True,Domestic Common Stock,Healthcare,Biotechnology,Manufacturing,Pharmaceutical Preparations
16,2018-01-01,1270073,ICPT,Kapadia Sandip,1677141,"{'isDirector': False, 'isOfficer': True, 'offi...",nonDerivative,Common Stock,F,D,...,24541.0,INTERCEPT PHARMACEUTICALS INC,1270073,NASDAQ,True,Domestic Common Stock,Healthcare,Biotechnology,Manufacturing,Pharmaceutical Preparations
17,2018-01-01,1270073,ICPT,Kapadia Sandip,1677141,"{'isDirector': False, 'isOfficer': True, 'offi...",nonDerivative,Common Stock,F,D,...,24541.0,INTERCEPT PHARMACEUTICALS INC,1270073,NASDAQ,True,Domestic Common Stock,Healthcare,Biotechnology,Manufacturing,Pharmaceutical Preparations


Next, we enrich the dataset by linking CIK identifiers of companies to their sector and industry classifications.

The SEC API doesn’t use tickers to recognize companies but uses the CIK which is a unique number assigned to US companies. The get_ticker_cik() function create a dataframe to map the ticker to the CIK and vice-versa.

In [58]:
# get ticker and cik information from the following url and save the information in pandas dataframes

def get_ticker_cik():
    #make a request to get the json
    for attempt in range(10):
        try:
            res=requests.get("https://www.sec.gov/files/company_tickers.json", headers={"User-Agent":"Java-http-client/"})
            
            if res.status_code == 200:
                data = res.json()
            else:
                print(f"Error: Received status code {res.status_code}")
        
        except requests.exceptions.RequestException as e:
            print(f"Request failed: {e}")
    
    #create a df with columns: "cik_str","tiker","title"
    df=pd.DataFrame.from_dict(data, orient="index")
    #create a df that maps the cik to the ticker
    fromCik=pd.DataFrame([df["ticker"].values],columns=df["cik_str"].values)
    #create a df that maps the ticker to the cik
    fromTicker=pd.DataFrame([df["cik_str"].values],columns=df["ticker"].values)
    return fromCik, fromTicker


fromCik, fromTicker = get_ticker_cik()

Error: Received status code 403
Error: Received status code 403


Save the mapping data in txt file as this data is time consuming to generate.

In [435]:
# uncomment to map cik to company details

# all_ciks_mapping = []

# for index, cik in enumerate(fromCik):
#         try:
#             details = mappingApi.resolve('cik', str(cik))
#             all_ciks_mapping.append(details)
#         except Exception as e:
#             print(f"Failed to resolve CIK {cik}: {e}")

# save results in txt file

# with open('cik_mapping.txt', 'w') as f:
#     for element in all_ciks_mapping:
#         f.write(json.dumps(element) + '\n')

Read the mapping data and convert the data into a dataframe.

In [60]:
cik = []

with jsonlines.open("cik_mapping.txt") as reader:
    for element in reader:
      cik.append(element)

In [61]:
# convert the information from the cik mapping into a pandas dataframe

columns = pd.json_normalize(cik[0])
columns = columns.columns.values

company_details = pd.DataFrame(columns=columns, index=range(len(cik)))

for index, element in enumerate(cik) :
  if len(pd.json_normalize(element).iloc[0]) == 17: # filter out entries with missing information
         company_details.iloc[index] = pd.json_normalize(element).iloc[0] # iloc[0] in case there are multiple entries with the same cik

In [62]:
company_details.head(10)

Unnamed: 0,name,ticker,cik,cusip,exchange,isDelisted,category,sector,industry,sic,sicSector,sicIndustry,famaSector,famaIndustry,currency,location,id
0,MICROSOFT CORP,MSFT,789019,594918104,NASDAQ,False,Domestic Common Stock,Technology,Software - Infrastructure,7372,Services,Services-Prepackaged Software,,Business Services,USD,Washington; U.S.A,0f08a6a6742dc4148badfef6977406cf
1,APPLE INC,AAPL,320193,037833100,NASDAQ,False,Domestic Common Stock,Technology,Consumer Electronics,3571,Manufacturing,Electronic Computers,,Computers,USD,California; U.S.A,a43c3ffca9b4a0be9cee4fa1120832a2
2,NVIDIA CORP,NVDA,1045810,67066G104,NASDAQ,False,Domestic Common Stock,Technology,Semiconductors,3674,Manufacturing,Semiconductors & Related Devices,,Electronic Equipment,USD,California; U.S.A,4a73b69083f93d38e05e0b76219875c9
3,ALPHABET INC,GOOGL,1652044,02079K305 38259P508,NASDAQ,False,Domestic Common Stock Primary Class,Communication Services,Internet Content & Information,7370,Services,Services-Computer Programming Data Processing ...,,Business Services,USD,California; U.S.A,f4d5c493c7fe5a85bcd98005175b18bb
4,AMAZON COM INC,AMZN,1018724,023135106,NASDAQ,False,Domestic Common Stock,Consumer Cyclical,Internet Retail,5961,Retail Trade,Retail-Catalog & Mail-Order Houses,,Retail,USD,Washington; U.S.A,2d771e9bff7a710c52906bf5373c73bf
5,META PLATFORMS INC,META,1326801,30303M102,NASDAQ,False,Domestic Common Stock,Communication Services,Internet Content & Information,7370,Services,Services-Computer Programming Data Processing ...,,Business Services,USD,California; U.S.A,f0092f90a39a6015d08f1bcf9447a466
6,BERKSHIRE HATHAWAY INC,BRK.B,1067983,084670702 084670207,NYSE,False,Domestic Common Stock Primary Class,Financial Services,Insurance - Diversified,6331,Finance Insurance And Real Estate,Fire Marine & Casualty Insurance,,Insurance,USD,Nebraska; U.S.A,7b2698313c97ece6f2a4b65d83b4e430
7,TAIWAN SEMICONDUCTOR MANUFACTURING CO LTD,TSM,1046179,874039100,NYSE,False,ADR Common Stock Primary Class,Technology,Semiconductors,3674,Manufacturing,Semiconductors & Related Devices,,Electronic Equipment,TWD,Taiwan,bb348a47b6beb63735aabf7a3468f798
8,ELI LILLY & CO,LLY,59478,532457108,NYSE,False,Domestic Common Stock,Healthcare,Drug Manufacturers - General,2834,Manufacturing,Pharmaceutical Preparations,,Pharmaceutical Products,USD,Indiana; U.S.A,fc255650f58e05396362dea927f8278e
9,BROADCOM INC,AVGO,1730168,11135F101 Y09827109 Y0486S104,NASDAQ,False,Domestic Common Stock Primary Class,Technology,Semiconductors,3674,Manufacturing,Semiconductors & Related Devices,,Electronic Equipment,USD,California; U.S.A,4e7a386c304e58a18f4d769aec92a526


Merge the information from the mapping with the insider transactions and save the data as a csv file.

In [63]:
# merge the information from the cik mapping with the insider transactions

company_details = company_details.rename(columns={'cik': 'issuerCik'})

trades = insider_trades.merge(company_details,
                              on="issuerCik",
                              how="left")

trades.head(10)

Unnamed: 0,periodOfReport,issuerCik,issuerTicker,reportingPersonName,reportingPersonCik,relationship,type,securityTitle,codingCode,acquiredDisposed,...,sector_y,industry_y,sic,sicSector_y,sicIndustry_y,famaSector,famaIndustry,currency,location,id
0,2018-01-01,1559865,EVTC,Schuessler Morgan M,1407607,"{'isDirector': True, 'isOfficer': True, 'offic...",nonDerivative,Common Stock,S,D,...,Technology,Software - Infrastructure,7374.0,Services,Services-Computer Processing & Data Preparation,,Business Services,USD,Puerto Rico,68e16c0b5b9dc20693b70438dda5c1e5
1,2018-01-01,1691303,HCC,Boyles Dale W,1419071,"{'isDirector': False, 'isOfficer': True, 'offi...",nonDerivative,Common Stock,F,D,...,Basic Materials,Coking Coal,1220.0,Mining,Bituminous Coal & Lignite Mining,,Coal,USD,Alabama; U.S.A,1529f9a554428770fa4839bd4fada696
2,2018-01-01,1270073,ICPT,Shapiro David,1559570,"{'isDirector': False, 'isOfficer': True, 'offi...",nonDerivative,Common Stock,F,D,...,,,,,,,,,,
3,2018-01-01,1270073,ICPT,Shapiro David,1559570,"{'isDirector': False, 'isOfficer': True, 'offi...",nonDerivative,Common Stock,F,D,...,,,,,,,,,,
4,2018-01-01,1270073,ICPT,Shapiro David,1559570,"{'isDirector': False, 'isOfficer': True, 'offi...",nonDerivative,Common Stock,F,D,...,,,,,,,,,,
5,2018-01-01,1270073,ICPT,Shapiro David,1559570,"{'isDirector': False, 'isOfficer': True, 'offi...",nonDerivative,Common Stock,F,D,...,,,,,,,,,,
6,2018-01-01,1270073,ICPT,Kapadia Sandip,1677141,"{'isDirector': False, 'isOfficer': True, 'offi...",nonDerivative,Common Stock,F,D,...,,,,,,,,,,
7,2018-01-01,1270073,ICPT,Kapadia Sandip,1677141,"{'isDirector': False, 'isOfficer': True, 'offi...",nonDerivative,Common Stock,F,D,...,,,,,,,,,,
8,2018-01-01,1270073,ICPT,Kapadia Sandip,1677141,"{'isDirector': False, 'isOfficer': True, 'offi...",nonDerivative,Common Stock,F,D,...,,,,,,,,,,
9,2018-01-01,1270073,ICPT,Kapadia Sandip,1677141,"{'isDirector': False, 'isOfficer': True, 'offi...",nonDerivative,Common Stock,F,D,...,,,,,,,,,,


In [27]:
# insider trades in csv file

trades.to_csv('insider_trades.csv')

In [70]:
trades.describe()

Unnamed: 0,periodOfReport,reportingPersonCik,shares,sharePrice,sharesOwnedFollowingTransaction
count,1269797,1269797.0,1269797.0,1269797.0,1269797.0
mean,2020-11-28 21:56:23.721649664,1491898.0,90048.34,201.3359,2430521.0
min,2018-01-01 00:00:00,4447.0,0.0,0.0,0.0
25%,2019-05-23 00:00:00,1263270.0,447.3292,14.06,14916.0
50%,2020-12-21 00:00:00,1534094.0,2000.0,39.49,60290.0
75%,2022-03-25 00:00:00,1706865.0,8897.0,92.33,280763.0
max,2023-12-30 00:00:00,2009104.0,3150000000.0,32900000.0,14725210000.0
std,,259495.6,4627008.0,29627.97,29427280.0
