# <h1 style='font-size:3rem;color:orange;'>IPO Prospectus Correlation with 1st-day-returns</h1>

#### Instructions for usage: 
##### Please replace all occurences of '/Users/kwanw4/Documents' to your local directory path. 
##### Please download the data files from here: https://www.dropbox.com/sh/xpt7tbc5xtbjh5l/AAB2R0Q-CKHjXh-TTtDIz_iba?dl=0 and store them in the same local directory path.

## 1. Install packages

In [None]:
#pip install -U sec-edgar-downloader
#pip install transformers
#conda install -c anaconda nltk
#conda install -c pytorch pytorch
#conda install -c huggingface transformers

## 2. Import Libraries

In [1]:
import pandas as pd
import numpy as np

import os
import re
import glob
import torch
import requests
from decimal import Decimal
from bs4 import BeautifulSoup
from sec_edgar_downloader import Downloader
import unicodedata

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import naive_bayes
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LinearRegression

import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification

## 3. Download datasets | Stock Price Data | SEC Prospectus Data

### 3.1 Scrap IPO data of the past 12 months

In [9]:
# Get data source
url = 'https://www.iposcoop.com/last-12-months'
page = requests.get(url).text

# Obtain page's information with BeautifulSoup
soup = BeautifulSoup(page,'lxml')

#Obtain table data using a selector in the base HTML
table = soup.select_one('.standard-table.ipolist')
IPO_data = pd.read_html(str(table))[0]

#Reformat IPO stock price data
IPO_data['Offer Price'] = IPO_data['Offer Price'].replace('[$,]', '', regex=True).astype('float')
IPO_data['1st Day Close'] = IPO_data['1st Day Close'].replace('[$,]', '', regex=True).astype('float')
IPO_data['Current Price'] = IPO_data['Current Price'].replace('[$,]', '', regex=True).astype('float')

#Calculate 1st-day-return
IPO_data['1st-day-return'] = (IPO_data['1st Day Close']-IPO_data['Offer Price'])/IPO_data['Offer Price']

#Calculate holding-period-return assuming holding period is from Offer Date to current date
IPO_data['holding-period-return'] = (IPO_data['Current Price']-IPO_data['Offer Price'])/IPO_data['Offer Price']
IPO_data = IPO_data.drop(columns=['Return','SCOOP Rating'])

#Generate data file
IPO_data.to_csv('IPO_data.csv')

In [10]:
#Display data
IPO_data.head()

Unnamed: 0,Company,Symbol,Industry,Offer Date,Shares (millions),Offer Price,1st Day Close,Current Price,1st-day-return,holding-period-return
0,Akanda Corp.,AKAN,Health Care,3/15/2022,4.0,4.0,10.5,8.04,1.625,1.01
1,"The Marygold Companies, Inc. (aka Concierge Te...",MGLD,Financials,3/10/2022,1.7,2.0,2.17,1.87,0.085,-0.065
2,"Blue Water Vaccines, Inc.",BWV,Health Care,2/18/2022,2.2,9.0,57.4,46.47,5.377778,4.163333
3,Meihua International Medical Technologies,MHUA,Health Care,2/16/2022,3.6,10.0,12.92,7.41,0.292,-0.259
4,"Vivakor, Inc.",VIVK,Oil & Gas,2/14/2022,1.6,5.0,4.35,2.68,-0.13,-0.464


### 3.2 Download the prospectuses of the IPO companies

In [None]:
#Specify directory to store files
dl = Downloader('/Users/kwanw4/Documents/ipo')

#Load in all tickers 
equity_ids = IPO_data['Symbol'].tolist()

#Loop through each ticker symbol and download its 424B4 form
for equity_id in equity_ids:
        dl.get('424B4', equity_id)

# 4. Data Processing

### 4.1 Parse Original Prospectus html file

In [None]:
#Initialize dataframe for storing textual data
text_data = pd.DataFrame(columns=['Symbol','original_text'])

#Loop through all downloaded html prospectus files
for filename in glob.iglob('/Users/kwanw4/Documents/ipo/**/*.html',recursive = True):
    #Slice filename string to get ticker symbol
    #eg. if filename ='/Users/Documents/ipo/sec-edgar-filings/AREN/424B4/01452/filing-details.html', Symbol = 'AREN'
    delim_front = filename.find('sec-edgar-filings/')+18
    delim_end = filename.find('/424B4/')
    Symbol= (filename[delim_front:delim_end])
    
    #Parse the html file
    with open(str(filename)) as fp:
        soup = BeautifulSoup(fp, 'html.parser')
        Textual = (soup.get_text())
        
        #Initial text pre-processing
        Textual = Textual.strip()
        Textual = Textual.replace('•', '.')
        Textual = Textual.replace('●', '.')
        Textual = Textual.replace('$', '.')
        Textual = os.linesep.join([s for s in Textual.splitlines() if s])
        Textual = Textual.replace('\n', ' ').replace('\r', '')
        Textual = Textual.replace(u'\xa0', u' ')
        Textual = re.sub(' +', ' ', Textual)
        
    #Store the symbol and the processed text into the dataframe
    new_row = {'Symbol':Symbol, 'original_text':Textual}
    text_data = text_data.append(new_row, ignore_index=True)

#Drop duplicated rows signifying reused tickers
text_data = text_data.drop_duplicates('Symbol', keep =False)

#Save text_data as string
text_data['original_text'].astype(str)

### 4.2 Filter and Tokenize Forward-Looking Statements (FLS)

In [None]:
#Establish Forward-looking Statements (FLS) keywords
FLS_keywords = ['may', 'might', 'will', 'should', 'believe', 'expect', 'could', 
                'would', 'plan', 'planned', 'planning' 'anticipate', 'estimate', 
                'contine', 'predict', 'project', 'potential', 'target', 'goal', 
                'accelerate', 'await', 'coming', 'confidence', 'confident', 'estimated', 
                'likely', 'unlikely', 'forecast', 'forecasted', 'predict', 'predicted', 
                'hope', 'intend', 'intention', 'outlook', 'optimistic', 'look ahead', 
                'look forward', 'soon', 'ahead', 'envisage','prospect', 'eventual',
                'risk', 'uncertain', 'future']

#Create empty dataframe column to store cleansed text
text_data['cleansed_text'] = ''

#Iterate through each ticker
for i, row in text_data.iterrows():
    input_file = sent_tokenize(text_data.at[i,'original_text'])
    output_file = []
    
    for sent in input_file:
        
        # tokenize and lowercase tokens of the sentence
        tokenized_sent = [word.lower() for word in word_tokenize(sent)]
        
        # if any item in the tokenized sentence is a keyword, append the original sentence
        if any(keyw in tokenized_sent for keyw in FLS_keywords):
            output_file.append(sent)
    
    #convert ouput_file to string
    output_file =' '.join(output_file)

    #store the string text in new column
    text_data.at[i,'cleansed_text'] = output_file

#Delete original text column
text_data = text_data.drop(columns=['original_text'])
text_data.to_csv('text_data.csv')

In [3]:
#Display data
text_data.head()

Unnamed: 0.1,Unnamed: 0,Symbol,cleansed_text
0,0,MDV,"With .47.3 million of cash as of January 31, 2..."
1,1,BRZE,We will not receive any proceeds from the sale...
2,2,HRT,Our common stock has been approved for listing...
3,3,TCBX,Our common stock has been approved for listing...
4,4,FLNC,Our Class A common stock has been approved for...


### 4.3 Finalize Dataframe for Analysis

In [11]:
#Prepare dataframe for analysis
df = pd.merge(IPO_data,text_data, on='Symbol')
df = df[['Symbol','1st-day-return','cleansed_text','holding-period-return']]

#Ouput merged data to csv
df.to_csv('complete_data.csv')

In [12]:
#Display data
df.head()

Unnamed: 0,Symbol,1st-day-return,cleansed_text,holding-period-return
0,AKAN,1.625,See “Prospectus Summary — Implications of Bein...,1.01
1,BWV,5.377778,We are an “emerging growth company” as that te...,4.163333
2,MHUA,0.292,"Upon the completion of this offering, we will ...",-0.259
3,VIVK,-0.13,Our Common Stock was previously quoted on the ...,-0.464
4,MDV,1.72,"With .47.3 million of cash as of January 31, 2...",-0.3076


# 5. Data Modelling

### 5.1 FinBERT - A Pretrained Language Model for Financial Communications

In [4]:
df = pd.read_csv('/Users/kwanw4/Documents/complete_data.csv', index_col=[0])

#Convert dataframe to array
dataframe_array = np.array(df)
np.random.shuffle(dataframe_array)
Symbol = list(dataframe_array[:,1])
first_day_return = list(dataframe_array[:,2])
textual_list = list(dataframe_array[:,3])
holding_period_return = list(dataframe_array[:,4])

#Define model
tokenizer=AutoTokenizer.from_pretrained('ProsusAI/finbert')
model=AutoModelForSequenceClassification.from_pretrained('ProsusAI/finbert')

#Transform input tokens
inputs=tokenizer(textual_list,padding=True,truncation=True,return_tensors='pt')

#Apply model
outputs = model(**inputs)

#Define predictions outputs
predictions = torch.nn.functional.softmax(outputs.logits,dim=-1)
positive = predictions[:,0].tolist()
negative = predictions[:,1].tolist()
neutral = predictions[:,2].tolist()

#Convert outputs to dataframe
table={'Text':textual_list,
       'Symbol':Symbol,
       '1st-day-return':first_day_return,
       'Holding-period-return':holding_period_return,
       'Positive':positive,
       'Negative':negative,
       'Neutral':neutral}

FinBERT_df = pd.DataFrame(table, columns = ['Symbol', '1st-day-return', 'Holding-period-return',
                                            'Positive', 'Negative', 'Neutral'])

#Export results to .csv file
FinBERT_df.to_csv('FinBERT_data.csv')

#Display data
FinBERT_df.head()

Unnamed: 0,Symbol,1st-day-return,Holding-period-return,Positive,Negative,Neutral
0,SEV,1.546667,-0.577333,0.026042,0.025654,0.948303
1,FLNC,0.244286,-0.491429,0.036628,0.013713,0.949659
2,WBEV,-0.056154,-0.716154,0.023628,0.028828,0.947544
3,REFI,0.030625,0.0975,0.024967,0.028532,0.946501
4,CIAN,0.10125,-0.7875,0.024464,0.026286,0.94925


In [3]:
#Evaluate the model
df = pd.read_csv('/Users/kwanw4/Documents/FinBERT_data.csv', index_col=[0])

#Display correlation betweeen sentiment scores and returns
print(df['1st-day-return'].corr(df['Positive']))
print(df['1st-day-return'].corr(df['Negative']))
print(df['Holding-period-return'].corr(df['Positive']))
print(df['Holding-period-return'].corr(df['Negative']))

-0.04766703161409033
-0.013450053320842151
-0.10616304329360833
-0.06719223395382772


In [4]:
#Perform Linear Regression to evaluate relationship between sentiment scores and returns

#Select x- and y- variables
X = df[['Positive', 'Negative', 'Neutral']]
y = df['1st-day-return']

#Split training set and test set data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

#Build regression model
regressor = LinearRegression()
regressor.fit(X_train, y_train)

#Find coefficient of sentiment variables
coeff_df = pd.DataFrame(regressor.coef_, X.columns, columns=['Coefficient'])
coeff_df

Unnamed: 0,Coefficient
Positive,-4718733.0
Negative,-4718731.0
Neutral,-4718731.0


In [6]:
#Evaluate linear regression model on test set data
y_pred = regressor.predict(X_test)
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(df.head())

#Calculate MAE, MSE, RMSE
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

      Actual  Predicted
45 -1.000000   0.524380
33 -0.015385   0.296141
40 -0.266000   0.244030
26  0.000400   0.363025
11 -0.153571   0.497860
Mean Absolute Error: 0.43650076821730394
Mean Squared Error: 0.3268147106873461
Root Mean Squared Error: 0.5716771035185387


### 5.2 Vader Sentiment Analysis

In [9]:
df = pd.read_csv('/Users/kwanw4/Documents/complete_data.csv', index_col=[0])

#Add empty sentiment columns for later calculation
df['compound'] = ''
df['negative'] = ''
df['neutral'] = ''
df['positive'] = ''

# Instantiate the Sentiment Analyzer
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

# Calculate sentiment scores
df['compound'] = df['cleansed_text'].apply(lambda x: sid.polarity_scores(x)['compound'])
df['negative'] = df['cleansed_text'].apply(lambda x: sid.polarity_scores(x)['neg'])
df['neutral'] = df['cleansed_text'].apply(lambda x: sid.polarity_scores(x)['neu'])
df['positive'] = df['cleansed_text'].apply(lambda x: sid.polarity_scores(x)['pos'])

# Remove unecessary columns
df = df.drop('cleansed_text',axis=1)

#Display data
df.head()

Unnamed: 0,Symbol,1st-day-return,holding-period-return,compound,negative,neutral,positive
0,AKAN,1.625,1.01,1.0,0.059,0.799,0.142
1,BWV,5.377778,4.163333,1.0,0.071,0.799,0.129
2,MHUA,0.292,-0.259,1.0,0.058,0.819,0.122
3,VIVK,-0.13,-0.464,1.0,0.047,0.816,0.137
4,MDV,1.72,-0.3076,1.0,0.042,0.817,0.141


In [10]:
#Generate summary statistics
df.describe()

Unnamed: 0,1st-day-return,holding-period-return,compound,negative,neutral,positive
count,56.0,56.0,56.0,56.0,56.0,56.0
mean,0.235225,-0.136885,1.0,0.055839,0.801036,0.143
std,0.853582,0.721891,0.0,0.007358,0.010191,0.009456
min,-1.0,-0.819375,1.0,0.032,0.775,0.122
25%,-0.08938,-0.492054,1.0,0.052,0.7965,0.136
50%,0.047571,-0.317917,1.0,0.057,0.801,0.1435
75%,0.243884,-0.027042,1.0,0.06,0.80525,0.14825
max,5.377778,4.163333,1.0,0.071,0.828,0.17


### 5.3 Naive Bayes

In [None]:
df = pd.read_csv('/Users/kwanw4/Documents/complete_data.csv', index_col=[0])

#Download nltk packages
nltk.download('punkt')
nltk.download('stopwords')

#TFIDF Vectorizer
stopset = set(stopwords.words('english'))
vectorizer = TfidfVectorizer(use_idf=True, lowercase=True, strip_accents='ascii', stop_words=stopset)

#Declare dependent variable to be the stock returns direction
#If returns is positive, y = 1
#If returns is zero or negative, y = 0
df['1st-day-return'] = np.where(df['1st-day-return']>0, 1, 0)
df['holding-period-return'] = np.where(df['holding-period-return']>0, 1, 0)


#Build model
def NaiveBayes(y_variable):
    y = df[y_variable]
    
    trans = RobustScaler()
    y = np.array (y).reshape ((len( y), 1))
    y = trans.fit_transform(y)
    
    x = vectorizer.fit_transform(df.cleansed_text)
    
    #Get number of observations and unique words
    #print (y.shape)
    #print (x.shape)
    
    #Test Train Split 
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)
    
    #Train a naive_bayes classifier
    clf = naive_bayes.MultinomialNB()
    clf.fit(x_train, y_train.ravel())
    
    #Test the model's accuracy
    print(roc_auc_score(y_test, clf.predict_proba(x_test)[:,1]))

In [18]:
#Display ROC AUC score for 1st-day returns and returns to-date
NaiveBayes('1st-day-return')
NaiveBayes('holding-period-return')

0.3877551020408163
0.5454545454545454
