# Importing Data

In [1]:
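# import required packages for data handling and plotting
# NOTE(review): `import matplotlib as plt` below is immediately rebound by `import matplotlib.pyplot as plt`, so only the pyplot binding is in effect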
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt
import matplotlib.pyplot as plt
from pandas import DataFrame
%matplotlib inline

# import required packages for splitting data
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# import required packages for evaluating models
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

# import `logistic regression` model
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the raw CSV (first row is the header); cells containing '-' are parsed as NaN
compdata_original = pd.read_csv(
    'comp1data.csv',
    header=0,
    na_values=['-'],
)

# Sanity-check the load by displaying the first 5 rows
compdata_original.head(5)

Unnamed: 0,I1,I2,I3,P(IPO),P(H),P(L),P(1Day),C1,C2,C3,...,C6,C7,T1,T2,T3,T4,T5,S1,S2,S3
0,AATI,ADVANCED ANALOGIC TECHNOLOGIES INC,3674,10.0,9.5,8.5,11.87,122.0,1.0,3.43,...,10600000.0,51.345,470.0,12719.0,11560.0,301.0,690.0,62.0,117.0,139.0
1,ABPI,ACCENTIA BIOPHARMACEUTICALS INC,2834,8.0,10.0,8.0,7.25,259.0,0.0,-1.62,...,2400000.0,25.936,791.0,21792.0,19585.0,510.0,1120.0,71.0,242.0,237.0
2,ACAD,ACADIA PHARMACEUTICALS INC,2834,7.0,14.0,12.0,6.7,90.0,1.0,-1.24,...,5000000.0,7.378,201.0,5262.0,4785.0,128.0,325.0,61.0,33.0,60.0
3,ACHN,ACHILLION PHARMACEUTICALS INC,2834,11.5,16.0,14.0,12.39,209.0,1.0,-0.91,...,4500000.0,8.526,328.0,8259.0,7574.0,177.0,509.0,80.0,59.0,110.0
4,ACLI,AMERICAN COMMERCIAL LINES INC.,4492,21.0,21.0,19.0,56.599998,80.0,1.0,0.07,...,8250000.0,632.298,572.0,14830.0,13176.0,336.0,720.0,67.0,149.0,167.0


# Descriptive Statistics

In [16]:
# Dimensions of the raw frame as (rows, columns)
compdata_original.shape

(682, 22)

In [17]:
# Summary statistics for every column; include='all' also reports the
# non-numeric columns (count / unique / top / freq) alongside the numeric ones
compdata_original.describe(include='all')

Unnamed: 0,I1,I2,I3,P(IPO),P(H),P(L),P(1Day),C1,C2,C3,...,C6,C7,T1,T2,T3,T4,T5,S1,S2,S3
count,682,682,674.0,677.0,672.0,672.0,660.0,660.0,660.0,646.0,...,676.0,610.0,681.0,681.0,681.0,681.0,681.0,681.0,681.0,681.0
unique,682,682,201.0,,,,,,,,...,,,,,,,,,,
top,CSCD,"Bois d Arc Energy, Inc.",2834.0,,,,,,,,...,,,,,,,,,,
freq,1,1,76.0,,,,,,,,...,,,,,,,,,,
mean,,,,13.837666,15.48119,13.515045,25.934766,149.728788,0.859091,1.788904,...,12415190.0,500.459962,465.634361,12758.606461,11395.844347,294.353891,679.220264,68.421439,120.104258,144.759178
std,,,,6.053731,6.653429,5.835646,73.234948,152.817467,0.348192,162.666532,...,25128550.0,1648.337634,175.741647,5449.644597,4839.670179,121.532637,472.914323,39.096525,84.828959,69.276285
min,,,,3.0,0.0,3.0,0.0,10.0,0.0,-786.239,...,525000.0,0.074,132.0,0.0,0.0,0.0,-1.0,-1.0,20.0,26.0
25%,,,,10.0,12.5,11.0,11.0,85.0,1.0,-0.8525,...,5000000.0,37.24575,351.0,9195.0,8162.0,213.0,462.0,45.0,73.0,100.0
50%,,,,13.5,15.0,13.0,14.845,107.0,1.0,0.01,...,7398704.0,103.833,444.0,12045.0,10785.0,279.0,624.0,60.0,100.0,134.0
75%,,,,17.0,17.0,15.0,20.485,155.25,1.0,0.47,...,12000000.0,331.138,551.0,15241.0,13760.0,354.0,795.0,85.0,142.0,173.0


In [18]:
# Per-column medians of the numeric columns only.
# numeric_only=True is passed explicitly because the frame also holds text
# columns (e.g. I1, I2): pandas >= 2.0 no longer drops those silently, so a
# bare .median() raises TypeError there. The result on older pandas is unchanged.
compdata_original.median(numeric_only=True)

P(IPO)     1.350000e+01
P(H)       1.500000e+01
P(L)       1.300000e+01
P(1Day)    1.484500e+01
C1         1.070000e+02
C2         1.000000e+00
C3         1.000000e-02
C4         9.124591e-03
C5         2.740018e+07
C6         7.398704e+06
C7         1.038330e+02
T1         4.440000e+02
T2         1.204500e+04
T3         1.078500e+04
T4         2.790000e+02
T5         6.240000e+02
S1         6.000000e+01
S2         1.000000e+02
S3         1.340000e+02
dtype: float64

In [27]:
# Work on an independent (deep) copy so the raw frame loaded above stays untouched
compdata = compdata_original.copy(deep=True)

# Imputation

In [31]:
# Drop the rows where T1 or T2 is 0 (a zero is treated as a missing value here).
# Rationale from the original analysis: ratios cannot be calculated from a
# missing word count -- they would come out as 0 and create outliers.
# Note: this filters on == 0 only; rows where T1/T2 are NaN are not removed.
for text_col in ('T1', 'T2'):
    zero_rows = compdata.loc[compdata[text_col] == 0].index
    compdata.drop(index=zero_rows, inplace=True)

# Verify: both selections should now print as empty DataFrames
for text_col in ('T1', 'T2'):
    print(compdata.loc[compdata[text_col] == 0])

Empty DataFrame
Columns: [I1, I2, I3, P(IPO), P(H), P(L), P(1Day), C1, C2, C3, C4, C5, C6, C7, T1, T2, T3, T4, T5, S1, S2, S3]
Index: []

[0 rows x 22 columns]
Empty DataFrame
Columns: [I1, I2, I3, P(IPO), P(H), P(L), P(1Day), C1, C2, C3, C4, C5, C6, C7, T1, T2, T3, T4, T5, S1, S2, S3]
Index: []

[0 rows x 22 columns]


# Calculation

# Normalization

# Z Score 

# Standard Deviation

# Continuous Correlation

# Categorical Correlation

# Binning

# Descriptive Statistics

# Feature Selection