In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
pd.options.display.max_rows = 4000
import sys
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

In [2]:
iris = sns.load_dataset('iris')


In [3]:
X = iris.iloc[:,:-1].values
Y = iris.iloc[:,-1:].values

In [4]:
from sklearn.model_selection import train_test_split
x_train, x_test,y_train,y_test = train_test_split(X, Y,test_size=0.2, random_state=0)

# Scaling

Scaling should be applied before applying dimension reduction.

In [5]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(x_train)
X_test = sc.fit_transform(x_test)

# PCA and LDA objects are created.

In [6]:
from sklearn.decomposition import PCA

pca = PCA(n_components = 2)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)


from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda = LinearDiscriminantAnalysis(n_components = 2)
X_train_lda = lda.fit_transform(X_train, y_train)
X_test_lda = lda.transform(X_test)

# Modelling

In [7]:
from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb_pca =XGBClassifier()
xgb_lda = XGBClassifier()

xgb.fit(X_train, y_train)
xgb_pca.fit(X_train_pca, y_train)
xgb_lda.fit(X_train_lda, y_train)

y_pred = xgb.predict(X_test)
y_pred_pca = xgb_pca.predict(X_test_pca)
y_pred_lda = xgb_lda.predict(X_test_lda)


# Comparison

In [8]:
from sklearn.metrics import confusion_matrix


print('XGB\n', confusion_matrix(y_test, y_pred))
print('\n\nXGB PCA\n', confusion_matrix(y_test, y_pred_pca))
print('\n\nXGB LDA\n', confusion_matrix(y_test, y_pred_lda))

XGB
 [[11  0  0]
 [ 0  9  4]
 [ 0  0  6]]


XGB PCA
 [[11  0  0]
 [ 0  6  7]
 [ 0  0  6]]


XGB LDA
 [[11  0  0]
 [ 0 12  1]
 [ 0  0  6]]


**Application of PCA on New York Stock Exchange data set.**

In [9]:
dataset = pd.read_csv('fundamentals.csv')
print(dataset.columns)

Index(['Unnamed: 0', 'Ticker Symbol', 'Period Ending', 'Accounts Payable',
       'Accounts Receivable', 'Add'l income/expense items', 'After Tax ROE',
       'Capital Expenditures', 'Capital Surplus', 'Cash Ratio',
       'Cash and Cash Equivalents', 'Changes in Inventories', 'Common Stocks',
       'Cost of Revenue', 'Current Ratio', 'Deferred Asset Charges',
       'Deferred Liability Charges', 'Depreciation',
       'Earnings Before Interest and Tax', 'Earnings Before Tax',
       'Effect of Exchange Rate',
       'Equity Earnings/Loss Unconsolidated Subsidiary', 'Fixed Assets',
       'Goodwill', 'Gross Margin', 'Gross Profit', 'Income Tax',
       'Intangible Assets', 'Interest Expense', 'Inventory', 'Investments',
       'Liabilities', 'Long-Term Debt', 'Long-Term Investments',
       'Minority Interest', 'Misc. Stocks', 'Net Borrowings', 'Net Cash Flow',
       'Net Cash Flow-Operating', 'Net Cash Flows-Financing',
       'Net Cash Flows-Investing', 'Net Income', 'Net Income Ad

In [10]:
dataset.head()


Unnamed: 0.1,Unnamed: 0,Ticker Symbol,Period Ending,Accounts Payable,Accounts Receivable,Add'l income/expense items,After Tax ROE,Capital Expenditures,Capital Surplus,Cash Ratio,...,Total Current Assets,Total Current Liabilities,Total Equity,Total Liabilities,Total Liabilities & Equity,Total Revenue,Treasury Stock,For Year,Earnings Per Share,Estimated Shares Outstanding
0,0,AAL,2012-12-31,3068000000.0,-222000000.0,-1961000000.0,23.0,-1888000000.0,4695000000.0,53.0,...,7072000000.0,9011000000.0,-7987000000.0,24891000000.0,16904000000.0,24855000000.0,-367000000.0,2012.0,-5.6,335000000.0
1,1,AAL,2013-12-31,4975000000.0,-93000000.0,-2723000000.0,67.0,-3114000000.0,10592000000.0,75.0,...,14323000000.0,13806000000.0,-2731000000.0,45009000000.0,42278000000.0,26743000000.0,0.0,2013.0,-11.25,163022200.0
2,2,AAL,2014-12-31,4668000000.0,-160000000.0,-150000000.0,143.0,-5311000000.0,15135000000.0,60.0,...,11750000000.0,13404000000.0,2021000000.0,41204000000.0,43225000000.0,42650000000.0,0.0,2014.0,4.02,716915400.0
3,3,AAL,2015-12-31,5102000000.0,352000000.0,-708000000.0,135.0,-6151000000.0,11591000000.0,51.0,...,9985000000.0,13605000000.0,5635000000.0,42780000000.0,48415000000.0,40990000000.0,0.0,2015.0,11.39,668129900.0
4,4,AAP,2012-12-29,2409453000.0,-89482000.0,600000.0,32.0,-271182000.0,520215000.0,23.0,...,3184200000.0,2559638000.0,1210694000.0,3403120000.0,4613814000.0,6205003000.0,-27095000.0,2012.0,5.29,73283550.0


In [11]:
dataset.drop(['Ticker Symbol', 'Period Ending', 'Unnamed: 0'], axis = 1, inplace = True)


In [12]:
dataset.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1781 entries, 0 to 1780
Data columns (total 76 columns):
 #   Column                                               Non-Null Count  Dtype  
---  ------                                               --------------  -----  
 0   Accounts Payable                                     1781 non-null   float64
 1   Accounts Receivable                                  1781 non-null   float64
 2   Add'l income/expense items                           1781 non-null   float64
 3   After Tax ROE                                        1781 non-null   float64
 4   Capital Expenditures                                 1781 non-null   float64
 5   Capital Surplus                                      1781 non-null   float64
 6   Cash Ratio                                           1482 non-null   float64
 7   Cash and Cash Equivalents                            1781 non-null   float64
 8   Changes in Inventories                               1781 non-null  

# Detecting Missing Values


In [13]:
dataset.isnull().sum()


Accounts Payable                                         0
Accounts Receivable                                      0
Add'l income/expense items                               0
After Tax ROE                                            0
Capital Expenditures                                     0
Capital Surplus                                          0
Cash Ratio                                             299
Cash and Cash Equivalents                                0
Changes in Inventories                                   0
Common Stocks                                            0
Cost of Revenue                                          0
Current Ratio                                          299
Deferred Asset Charges                                   0
Deferred Liability Charges                               0
Depreciation                                             0
Earnings Before Interest and Tax                         0
Earnings Before Tax                                     

# Correction of missing values


In [14]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
imputer.fit(dataset.iloc[:,6:7])
dataset.iloc[:,6:7] = imputer.transform(dataset.iloc[:,6:7])

imputer2 = SimpleImputer(missing_values = np.nan, strategy = 'mean')
imputer2.fit(dataset.iloc[:,11:12])
dataset.iloc[:,11:12] = imputer2.transform(dataset.iloc[:,11:12])

imputer3 = SimpleImputer(missing_values = np.nan, strategy = 'mean')
imputer3.fit(dataset.iloc[:,58:59])
dataset.iloc[:,58:59] = imputer3.transform(dataset.iloc[:,58:59])

imputer4 = SimpleImputer(missing_values = np.nan, strategy = 'mean')
imputer4.fit(dataset.iloc[:,-1:])
dataset.iloc[:,-1:] = imputer3.transform(dataset.iloc[:,-1:])

imputer5 = SimpleImputer(missing_values = np.nan, strategy = 'mean')
imputer5.fit(dataset.iloc[:,-2:-1])
dataset.iloc[:,-2:-1] = imputer3.transform(dataset.iloc[:,-2:-1])

imputer5 = SimpleImputer(missing_values = np.nan, strategy = 'mean')
imputer5.fit(dataset.iloc[:,-3:-2])
dataset.iloc[:,-3:-2] = imputer3.transform(dataset.iloc[:,-3:-2])

In [15]:
X = dataset.drop('Earnings Per Share', axis = 1).values
Y = dataset.iloc[:,-2:-1].values

In [16]:
from sklearn.model_selection import train_test_split
x_train, x_test,y_train,y_test = train_test_split(X, Y,test_size=0.2, random_state=0)

In [17]:

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(x_train)
X_test = sc.fit_transform(x_test)

# Standardize the Data


# Principal component analysis

In [18]:
from sklearn.decomposition import PCA

pca = PCA(n_components = 43)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

In [19]:
X_train.shape


(1424, 75)

In [23]:
X_train_pca.shape



(1424, 43)

# Modelling


In [21]:

from xgboost import XGBRegressor

xgb_non_pca = XGBRegressor()
xgb_with_pca = XGBRegressor()

xgb_non_pca.fit(X_train, y_train)
xgb_with_pca.fit(X_train_pca, y_train)

pred_non_pca = xgb_non_pca.predict(X_test)
pred_with_pca = xgb_with_pca.predict(X_test_pca)

In [22]:

print('NON PCA', r2_score(y_test, pred_non_pca))
print('PCA', r2_score(y_test, pred_with_pca))

NON PCA 0.11408749118943873
PCA 0.7136849647954208
