# Linear Discriminant Analysis (LDA)

In [1]:
# Load the training data (sheet DATA_2_TRAIN of the Excel workbook)
import numpy as np 
import pandas as pd

# Read the training sheet; header=0 means the first row of the sheet
# provides the column names.
DTrain = pd.read_excel(
    "./data/Data_Illustration_Livre_ADL.xlsx",
    sheet_name="DATA_2_TRAIN",
    header=0,
)
DTrain.head()

Unnamed: 0,TYPE,MEOH,ACET,BU1,BU2,ISOP,MEPR,PRO1,ACAL
0,KIRSCH,336.0,225.0,1.0,1.0,92,37.0,177.0,0.0
1,KIRSCH,442.0,338.0,1.9,10.0,91,30.0,552.0,31.0
2,KIRSCH,373.0,356.0,0.0,29.0,83,27.0,814.0,11.0
3,KIRSCH,418.0,62.0,0.8,0.0,89,24.0,342.0,7.0
4,KIRSCH,84.0,65.0,2.0,2.0,2,0.0,288.0,6.0


In [2]:
from scientisttools.discriminant_analysis import LDA

# Instantiate the LDA estimator: every column after the first one is passed
# as a feature label, TYPE is the target, and the frame's index supplies the
# row labels.
lda = LDA(features_labels=list(DTrain.columns[1:]),
          target=["TYPE"],
          row_labels=DTrain.index)

# Fit the model on the training frame (the constructor call above is the
# instantiation; this line is the training step).
lda.fit(DTrain)

In [3]:
# Prior probabilities of each TYPE class
lda.priors_

TYPE  
POIRE     0.384615
KIRSCH    0.326923
MIRAB     0.288462
dtype: float64

In [4]:
# Total covariance matrix of the features
lda.tcov_

Unnamed: 0,MEOH,ACET,BU1,BU2,ISOP,MEPR,PRO1,ACAL
MEOH,136738.708428,6617.58388,3173.906712,372.960935,7655.131976,3593.549133,-65773.76201,1082.729148
ACET,6617.58388,14921.543661,-99.421071,-146.492066,74.836048,51.906735,12047.594167,427.863371
BU1,3173.906712,-99.421071,121.607692,85.720347,182.120211,79.10727,-2032.141765,10.528265
BU2,372.960935,-146.492066,85.720347,2968.665732,-178.398643,71.995083,22370.955686,-4.595913
ISOP,7655.131976,74.836048,182.120211,-178.398643,2333.864253,754.028959,-3366.731373,32.959879
MEPR,3593.549133,51.906735,79.10727,71.995083,754.028959,341.198959,-731.993431,14.037722
PRO1,-65773.76201,12047.594167,-2032.141765,22370.955686,-3366.731373,-731.993431,392100.202304,626.80951
ACAL,1082.729148,427.863371,10.528265,-4.595913,32.959879,14.037722,626.80951,64.872504


In [5]:
# Class-conditional covariance matrices (rows are indexed by TYPE, then by
# feature)
lda.gcov_

Unnamed: 0_level_0,Unnamed: 1_level_0,MEOH,ACET,BU1,BU2,ISOP,MEPR,PRO1,ACAL
TYPE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
KIRSCH,MEOH,34259.654412,11295.734191,63.55625,2044.777941,5676.952206,1755.313603,67355.330515,1198.613419
KIRSCH,ACET,11295.734191,14266.467794,17.725,784.245294,2728.963971,854.786985,31507.793676,347.592904
KIRSCH,BU1,63.55625,17.725,0.61625,3.604375,4.08125,2.91875,176.5375,3.12375
KIRSCH,BU2,2044.777941,784.245294,3.604375,461.584044,191.376471,109.921985,14519.423676,77.290404
KIRSCH,ISOP,5676.952206,2728.963971,4.08125,191.376471,1543.632353,424.028676,5333.718382,142.386397
KIRSCH,MEPR,1755.313603,854.786985,2.91875,109.921985,424.028676,150.840588,3019.769191,40.161324
KIRSCH,PRO1,67355.330515,31507.793676,176.5375,14519.423676,5333.718382,3019.769191,563401.932206,2659.623493
KIRSCH,ACAL,1198.613419,347.592904,3.12375,77.290404,142.386397,40.161324,2659.623493,94.126103
MIRAB,MEOH,22953.171429,2391.2,188.171429,448.928571,-825.7,-340.228571,7059.8,404.102857
MIRAB,ACET,2391.2,31890.780952,-321.8,-45.633333,-2517.780952,-672.457143,14724.695238,1031.910476


In [6]:
# Between-class covariance matrix
# NOTE(review): the displayed values equal tcov_ - wcov_ entry by entry, and
# two diagonal entries come out negative (ACET, ACAL) — confirm how the
# library defines this matrix before interpreting it as a covariance.
lda.bcov_

Unnamed: 0,MEOH,ACET,BU1,BU2,ISOP,MEPR,PRO1,ACAL
MEOH,96515.005967,-1036.207897,2874.380385,2895.636709,4160.456766,2253.103875,-73697.469933,249.047725
ACET,-1036.207897,-171.929156,10.645256,-295.221822,-218.577017,-171.362485,-1995.822816,-34.817407
BU1,2874.380385,10.645256,85.386427,63.282347,111.884089,57.166454,-2304.675642,7.174939
BU2,2895.636709,-295.221822,63.282347,142.738588,252.648088,163.424639,-1745.576399,18.361615
ISOP,4160.456766,-218.577017,111.884089,252.648088,177.45607,138.154289,-2332.718327,17.893743
MEPR,2253.103875,-171.362485,57.166454,163.424639,138.154289,95.504073,-1037.211943,11.632392
PRO1,-73697.469933,-1995.822816,-2304.675642,-1745.576399,-2332.718327,-1037.211943,51143.250291,-171.998705
ACAL,249.047725,-34.817407,7.174939,18.361615,17.893743,11.632392,-171.998705,-1.273302


In [7]:
# Within-class covariance matrix
lda.wcov_

Unnamed: 0,MEOH,ACET,BU1,BU2,ISOP,MEPR,PRO1,ACAL
MEOH,40223.702461,7653.791777,299.526327,-2522.675774,3494.67521,1340.445258,7923.707923,833.681423
ACET,7653.791777,15093.472817,-110.066327,148.729756,293.413065,223.26922,14043.416983,462.680778
BU1,299.526327,-110.066327,36.221265,22.438,70.236122,21.940816,272.533878,3.353327
BU2,-2522.675774,148.729756,22.438,2825.927144,-431.046731,-91.429556,24116.532085,-22.957528
ISOP,3494.67521,293.413065,70.236122,-431.046731,2156.408183,615.87467,-1034.013045,15.066136
MEPR,1340.445258,223.26922,21.940816,-91.429556,615.87467,245.694886,305.218511,2.40533
PRO1,7923.707923,14043.416983,272.533878,24116.532085,-1034.013045,305.218511,340956.952013,798.808215
ACAL,833.681423,462.680778,3.353327,-22.957528,15.066136,2.40533,798.808215,66.145806


In [8]:
# Class-conditional means (one row per TYPE, one column per feature)
lda.gmean_

Unnamed: 0_level_0,MEOH,ACET,BU1,BU2,ISOP,MEPR,PRO1,ACAL
TYPE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
KIRSCH,371.676471,203.017647,1.2,21.017647,81.588235,28.894118,790.770588,12.011765
MIRAB,934.2,235.066667,20.2,13.566667,90.933333,29.4,195.266667,12.353333
POIRE,1084.35,185.25,21.33,49.38,118.05,50.0,317.4,14.495


In [9]:
# Coefficients of the score (classification) functions, one column per class
lda.coef_

Unnamed: 0,KIRSCH,MIRAB,POIRE
MEOH,0.003428,0.029028,0.03339
ACET,0.00639,0.016413,0.007513
BU1,-0.063681,0.40539,0.318047
BU2,-0.000883,0.071352,0.114993
ISOP,0.023082,0.029763,-0.008486
MEPR,0.037494,-0.128942,0.06178
PRO1,0.001971,-0.005413,-0.008318
ACAL,0.066184,-0.226424,-0.130332


In [10]:
# Constant terms (intercepts) of the score functions, one per class
lda.intercept_

Unnamed: 0,KIRSCH,MIRAB,POIRE
Intercept,-5.016453,-18.840685,-24.764879


In [11]:
# Statistical evaluation of each feature: Wilks' lambda, partial lambda,
# F statistic and its p-value
se = lda.statistical_evaluation_
se

Unnamed: 0,Wilks L.,Partial L.,"F(2, 42)",p-value
MEOH,0.117975,0.565488,16.136067,6e-06
ACET,0.074153,0.899667,2.341965,0.108572
BU1,0.084183,0.792475,5.499262,0.007563
BU2,0.095695,0.697142,9.122996,0.000513
ISOP,0.07231,0.9226,1.761764,0.184196
MEPR,0.087798,0.759852,6.636945,0.003128
PRO1,0.092396,0.722038,8.084336,0.001071
ACAL,0.075884,0.87915,2.8867,0.066885


In [12]:
# Read the test sheet from the same workbook; header=0 means the first row
# of the sheet provides the column names.
DTest = pd.read_excel(
    "./data/Data_Illustration_Livre_ADL.xlsx",
    sheet_name="DATA_2_TEST",
    header=0,
)
DTest.head()

Unnamed: 0,TYPE,MEOH,ACET,BU1,BU2,ISOP,MEPR,PRO1,ACAL
0,KIRSCH,3,15,0.2,30.0,9,9,350,9.0
1,KIRSCH,475,172,1.9,7.0,113,33,546,14.0
2,KIRSCH,186,101,0.0,1.6,36,11,128,8.0
3,KIRSCH,371,414,1.2,0.0,97,39,502,9.0
4,KIRSCH,583,226,2.3,19.0,120,46,656,11.0


In [13]:
# Split the test frame into predictors and target.
# Columns are selected by name rather than by position: positional slicing
# (columns[1:] / columns[0]) silently mislabels the data if the sheet's
# column order ever changes, whereas a missing "TYPE" column now raises a
# KeyError. "TYPE" is the same target name given to the LDA constructor.
XTest = DTest.drop(columns=["TYPE"])
yTest = DTest["TYPE"]
XTest.head()

Unnamed: 0,MEOH,ACET,BU1,BU2,ISOP,MEPR,PRO1,ACAL
0,3,15,0.2,30.0,9,9,350,9.0
1,475,172,1.9,7.0,113,33,546,14.0
2,186,101,0.0,1.6,36,11,128,8.0
3,371,414,1.2,0.0,97,39,502,9.0
4,583,226,2.3,19.0,120,46,656,11.0


In [14]:
# Score-function values of the test individuals, one column per class
# (first rows only)
lda.decision_function(XTest).head()

Unnamed: 0,KIRSCH,MIRAB,POIRE
0,-3.118809,-21.110618,-24.643294
1,3.432298,-7.976511,-11.4895
2,-1.709626,-14.52062,-19.344743
3,4.111012,-7.686396,-12.647287
4,4.778819,-4.320783,-5.750754


In [15]:
# Predicted class of each test individual (first rows displayed)
pred = lda.predict(XTest)
pred.head()

Unnamed: 0,predict
0,KIRSCH
1,KIRSCH
2,KIRSCH
3,KIRSCH
4,KIRSCH


In [16]:
# Observed classes of the first test individuals, to compare with `pred`
yTest.head()

0    KIRSCH
1    KIRSCH
2    KIRSCH
3    KIRSCH
4    KIRSCH
Name: TYPE, dtype: object

In [17]:
# Accuracy score on the test set
lda.score(XTest,yTest)

0.82

In [18]:
# Predicted class-membership probabilities of the test individuals, one
# column per class (first rows only)
lda.predict_proba(XTest).head()

Unnamed: 0,KIRSCH,MIRAB,POIRE
0,1.0,1.535523e-08,4.487814e-10
1,0.999989,1.109717e-05,3.307808e-07
2,0.999997,2.73058e-06,2.193638e-08
3,0.999992,7.523981e-06,5.271811e-08
4,0.999862,0.0001116948,2.673034e-05


In [19]:
# Class-by-class distance table (symmetric, zero diagonal).
# NOTE(review): presumably squared Mahalanobis distances between class
# centroids — confirm against the scientisttools documentation.
lda.squared_mdist_

Unnamed: 0,KIRSCH,MIRAB,POIRE
KIRSCH,0.0,27.37148,36.048105
MIRAB,27.37148,0.0,5.305086
POIRE,36.048105,5.305086,0.0


## Procédure Backward

In [20]:
# Backward stepwise variable selection, started from the fitted full model.
# verbose=True prints the per-variable statistics table at each step.
# NOTE(review): alpha=0.01 is presumably the p-value threshold above which a
# variable is removed, and model_train=True presumably refits an LDA on the
# retained variables (read back later through `train_model_`) — confirm
# against the scientisttools documentation.
from scientisttools.discriminant_analysis import STEPDISC
stepdisc = STEPDISC(method="backward",alpha=0.01,model_train=True,verbose=True)
stepdisc.fit(lda)

      Wilks L.  Partial L.          F   p-value
MEOH  0.117975    0.565488  16.136067  0.000006
ACET  0.074153    0.899667   2.341965  0.108572
BU1   0.084183    0.792475   5.499262  0.007563
BU2   0.095695    0.697142   9.122996  0.000513
ISOP  0.072310    0.922600   1.761764  0.184196
MEPR  0.087798    0.759852   6.636945  0.003128
PRO1  0.092396    0.722038   8.084336  0.001071
ACAL  0.075884    0.879150   2.886700  0.066885

      Wilks L.  Partial L.          F   p-value
MEOH  0.129692    0.557551  17.061483  0.000004
ACET  0.079826    0.905849   2.234643  0.119316
BU1   0.092945    0.777989   6.135344  0.004528
BU2   0.109809    0.658508  11.149568  0.000126
MEPR  0.098171    0.736573   7.689228  0.001397
PRO1  0.101216    0.714415   8.594537  0.000724
ACAL  0.081813    0.883851   2.825379  0.070331

      Wilks L.  Partial L.          F   p-value
MEOH  0.147097    0.542673  18.540063  0.000001
BU1   0.098601    0.809585   5.174419  0.009589
BU2   0.122087    0.653844  11.647178 

In [21]:
# Final (reduced) model produced by the backward selection
lda_reduit = stepdisc.train_model_
# Statistical evaluation of the variables kept in the reduced model
lda_reduit.statistical_evaluation_

Unnamed: 0,Wilks L.,Partial L.,"F(2, 45)",p-value
MEOH,0.155113,0.557582,17.852822,2e-06
BU1,0.110174,0.785016,6.161841,0.004313
BU2,0.136572,0.633277,13.029454,3.4e-05
MEPR,0.111753,0.773921,6.572738,0.003131
PRO1,0.131479,0.657808,11.704517,8.1e-05


In [22]:
# Score-function coefficients of the reduced model, one column per class
lda_reduit.coef_

Unnamed: 0,KIRSCH,MIRAB,POIRE
MEOH,0.006172,0.026956,0.031912
BU1,-0.086181,0.346433,0.280117
BU2,-0.005208,0.07042,0.116972
MEPR,0.086537,-0.025564,0.058299
PRO1,0.002536,-0.005289,-0.00836


In [23]:
# Score-function constant terms (intercepts) of the reduced model
lda_reduit.intercept_

Unnamed: 0,KIRSCH,MIRAB,POIRE
Intercept,-4.411341,-16.918659,-24.263717
