In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from sklearn.metrics import r2_score
pd.options.display.max_rows = 4000
import sys
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

In [2]:
dataset = pd.read_csv('dataset/fundamentals.csv')

In [3]:
dataset.head()

Unnamed: 0.1,Unnamed: 0,Ticker Symbol,Period Ending,Accounts Payable,Accounts Receivable,Add'l income/expense items,After Tax ROE,Capital Expenditures,Capital Surplus,Cash Ratio,...,Total Current Assets,Total Current Liabilities,Total Equity,Total Liabilities,Total Liabilities & Equity,Total Revenue,Treasury Stock,For Year,Earnings Per Share,Estimated Shares Outstanding
0,0,AAL,2012-12-31,3068000000.0,-222000000.0,-1961000000.0,23.0,-1888000000.0,4695000000.0,53.0,...,7072000000.0,9011000000.0,-7987000000.0,24891000000.0,16904000000.0,24855000000.0,-367000000.0,2012.0,-5.6,335000000.0
1,1,AAL,2013-12-31,4975000000.0,-93000000.0,-2723000000.0,67.0,-3114000000.0,10592000000.0,75.0,...,14323000000.0,13806000000.0,-2731000000.0,45009000000.0,42278000000.0,26743000000.0,0.0,2013.0,-11.25,163022200.0
2,2,AAL,2014-12-31,4668000000.0,-160000000.0,-150000000.0,143.0,-5311000000.0,15135000000.0,60.0,...,11750000000.0,13404000000.0,2021000000.0,41204000000.0,43225000000.0,42650000000.0,0.0,2014.0,4.02,716915400.0
3,3,AAL,2015-12-31,5102000000.0,352000000.0,-708000000.0,135.0,-6151000000.0,11591000000.0,51.0,...,9985000000.0,13605000000.0,5635000000.0,42780000000.0,48415000000.0,40990000000.0,0.0,2015.0,11.39,668129900.0
4,4,AAP,2012-12-29,2409453000.0,-89482000.0,600000.0,32.0,-271182000.0,520215000.0,23.0,...,3184200000.0,2559638000.0,1210694000.0,3403120000.0,4613814000.0,6205003000.0,-27095000.0,2012.0,5.29,73283550.0


In [4]:
dataset.drop(['Ticker Symbol', 'Period Ending', 'Unnamed: 0'], axis = 1, inplace = True)

In [5]:
dataset.columns

Index(['Accounts Payable', 'Accounts Receivable', 'Add'l income/expense items',
       'After Tax ROE', 'Capital Expenditures', 'Capital Surplus',
       'Cash Ratio', 'Cash and Cash Equivalents', 'Changes in Inventories',
       'Common Stocks', 'Cost of Revenue', 'Current Ratio',
       'Deferred Asset Charges', 'Deferred Liability Charges', 'Depreciation',
       'Earnings Before Interest and Tax', 'Earnings Before Tax',
       'Effect of Exchange Rate',
       'Equity Earnings/Loss Unconsolidated Subsidiary', 'Fixed Assets',
       'Goodwill', 'Gross Margin', 'Gross Profit', 'Income Tax',
       'Intangible Assets', 'Interest Expense', 'Inventory', 'Investments',
       'Liabilities', 'Long-Term Debt', 'Long-Term Investments',
       'Minority Interest', 'Misc. Stocks', 'Net Borrowings', 'Net Cash Flow',
       'Net Cash Flow-Operating', 'Net Cash Flows-Financing',
       'Net Cash Flows-Investing', 'Net Income', 'Net Income Adjustments',
       'Net Income Applicable to Common Sha

# Detecting Missing Values

In [6]:
dataset.isnull().sum()

Accounts Payable                                         0
Accounts Receivable                                      0
Add'l income/expense items                               0
After Tax ROE                                            0
Capital Expenditures                                     0
Capital Surplus                                          0
Cash Ratio                                             299
Cash and Cash Equivalents                                0
Changes in Inventories                                   0
Common Stocks                                            0
Cost of Revenue                                          0
Current Ratio                                          299
Deferred Asset Charges                                   0
Deferred Liability Charges                               0
Depreciation                                             0
Earnings Before Interest and Tax                         0
Earnings Before Tax                                     

In [7]:
dataset.dtypes

Accounts Payable                                       float64
Accounts Receivable                                    float64
Add'l income/expense items                             float64
After Tax ROE                                          float64
Capital Expenditures                                   float64
Capital Surplus                                        float64
Cash Ratio                                             float64
Cash and Cash Equivalents                              float64
Changes in Inventories                                 float64
Common Stocks                                          float64
Cost of Revenue                                        float64
Current Ratio                                          float64
Deferred Asset Charges                                 float64
Deferred Liability Charges                             float64
Depreciation                                           float64
Earnings Before Interest and Tax                       

# Correction of missing values

In [8]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
dataset.iloc[:,6:7] = imputer.fit_transform(dataset.iloc[:,6:7])
dataset.iloc[:,11:12] = imputer.fit_transform(dataset.iloc[:,11:12])
dataset.iloc[:,58:59] = imputer.fit_transform(dataset.iloc[:,58:59])
dataset.iloc[:,-1:] = imputer.fit_transform(dataset.iloc[:,-1:])
dataset.iloc[:,-2:-1] = imputer.fit_transform(dataset.iloc[:,-2:-1])
dataset.iloc[:,-3:-2] = imputer.fit_transform(dataset.iloc[:,-3:-2])

In [9]:
X = dataset.drop('Earnings Per Share', axis = 1).values
Y = dataset.iloc[:,-2:-1].values

In [92]:
from sklearn.model_selection import train_test_split
x_train, x_test,y_train,y_test = train_test_split(X, Y,test_size=0.15)

# Standardize the Data


In [93]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(x_train)
X_test = sc.fit_transform(x_test)

In [94]:
X_train

array([[-0.20249977,  0.07082054, -0.02959522, ..., -0.27759063,
        -0.67744363, -0.38326756],
       [-0.22708406, -0.03032134, -0.07664309, ...,  0.27200905,
         0.23330849, -0.48679789],
       [-0.30740723,  0.07026354, -0.10399149, ...,  0.08623277,
         1.14406061, -0.45619512],
       ...,
       [-0.2509117 ,  0.01675482, -0.27880078, ..., -0.09383247,
        -0.39934706, -0.00239316],
       [ 0.2802109 , -0.86715696,  1.85549958, ..., -0.13630519,
        -0.67744363,  0.18729179],
       [-0.30276665,  0.05438582, -0.10450978, ...,  0.27200905,
        -0.67744363, -0.46067598]])

# Principal component analysis

In [95]:
from sklearn.decomposition import PCA

pca = PCA(n_components = 20)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

In [98]:
X_train.shape

(1513, 75)

In [99]:
X_train_pca.shape

(1513, 20)

In [96]:
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor
ada = AdaBoostRegressor(base_estimator=XGBRegressor(n_estimators=100, base_score = 3, booster = 'gbtree', gamma = 1, random_state = None), n_estimators=20, learning_rate=1, loss='linear', random_state=1)
ada.fit(X_train_pca, y_train)
y_pred_ada = ada.predict(X_test_pca)

print('Ada R2 Score: ', r2_score(y_test, y_pred_ada))

Ada R2 Score:  0.8023455481602086
