# Canonical discriminant analysis

In [1]:
# Imports and working-directory setup.
import pandas as pd
import numpy as np
# NOTE(review): star import kept as-is because other cells may rely on the
# names it exports; prefer explicit imports once those usages are known.
from plydata import *
import os

# Directory that the relative data path ("./data/...") is resolved against.
# Defaults to the original author's local checkout; set the SCIENTISTTOOLS_DIR
# environment variable to run the notebook from another machine.
PROJECT_DIR = os.environ.get(
    "SCIENTISTTOOLS_DIR",
    "d:\\Bureau\\PythonProject\\packages\\scientisttools",
)
os.chdir(PROJECT_DIR)

In [2]:
# Load the wine data; the second column of the sheet (`Annee`, per the output)
# becomes the row index, while `Obs.` stays as a regular column.
wine = pd.read_excel("./data/wine_quality.xls",index_col=1)
# Preview the first five rows with the native pandas method instead of the
# plydata pipe (`wine >> head()`), which only works through the star import.
wine.head()

Unnamed: 0_level_0,Obs.,Temperature,Soleil,Chaleur,Pluie,Qualite
Annee,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1924,1,3064,1201,10,361,Moyen
1925,2,3000,1053,11,338,Mediocre
1926,3,3155,1133,19,393,Moyen
1927,4,3085,970,4,467,Mediocre
1928,5,3245,1258,36,294,Bon


In [3]:
from scientisttools.discriminant_analysis import CANDISC

In [4]:
# Configure a canonical discriminant analysis keeping 2 components, with
# `Qualite` as the target and the four listed columns as predictors; rows are
# labelled with the DataFrame index (`Annee`).
my_cda = CANDISC(n_components=2,
                 target=["Qualite"],
                 row_labels=wine.index,
                 features_labels=["Temperature","Soleil","Chaleur","Pluie"])
# NOTE(review): fitting prints a 4x4 feature-by-feature matrix (see the cell
# output); this looks like leftover debug output inside the library - confirm.
my_cda.fit(wine)

              Temperature        Soleil     Chaleur        Pluie
Temperature  12354.923771  10646.046975  766.804551 -5522.721715
Soleil       10646.046975   9614.972646  641.521220 -5173.700666
Chaleur        766.804551    641.521220   48.428528  -324.701111
Pluie        -5522.721715  -5173.700666 -324.701111  2858.563830


In [5]:
# Class labels of the target held by the fitted model.
my_cda.classes_

array(['Bon', 'Mediocre', 'Moyen'], dtype=object)

In [6]:
# Overview table: sample size, number of variables and classes, and the
# total / within-class / between-class degrees of freedom.
my_cda.summary_information_

Unnamed: 0,value
Total Sample Size,34
Variables,4
Classes,3
DF Total,33
DF Within Classes,31
DF Between Classes,2


In [7]:
# Frequency and proportion of each class of `Qualite`.
my_cda.class_level_information_

Unnamed: 0_level_0,Frequency,Proportion
Qualite,Unnamed: 1_level_1,Unnamed: 2_level_1
Mediocre,12,0.352941
Bon,11,0.323529
Moyen,11,0.323529


In [8]:
# Class-by-class distance matrix (symmetric with a zero diagonal in the output).
# Presumably squared Mahalanobis distances between class centroids, going by
# the attribute name - TODO confirm against the library.
my_cda.squared_mdist_

Unnamed: 0,Bon,Mediocre,Moyen
Bon,0.0,17.652729,4.519311
Mediocre,17.652729,0.0,5.492267
Moyen,4.519311,5.492267,0.0


In [9]:
# Per-variable univariate tests: standard deviation, R-squared, F-statistic and
# its p-value.
# NOTE: the attribute name is spelled `statistis_` (sic); the call succeeds
# with that spelling, as the cell output shows, so do not "fix" it here.
my_cda.univariate_test_statistis_

Unnamed: 0,Std. Dev.,R-squared,Rsq/(1-Rsq),F-statistic,Prob (F-statistic)
Temperature,141.184334,0.638605,1.767052,27.38931,1.408416e-07
Soleil,126.622972,0.617857,1.616822,25.060741,3.345802e-07
Chaleur,10.016564,0.497312,0.989305,15.33422,2.344932e-05
Pluie,91.401608,0.352537,0.544491,8.439607,0.001185298


In [10]:
# `anova_` is a mapping keyed by feature name (see the printed keys); it is
# bound to `anova` because the next cell indexes into it.
anova = my_cda.anova_
print(anova.keys())

dict_keys(['Temperature', 'Soleil', 'Chaleur', 'Pluie'])


In [11]:
# ANOVA table of Temperature against the class factor C(Qualite), including
# the eta-squared and omega-squared columns.
display(anova["Temperature"])

Unnamed: 0,sum_sq,df,mean_sq,F,PR(>F),eta_sq,omega_sq
C(Qualite),420067.4082,2.0,210033.7041,27.38931,1.408416e-07,0.638605,0.608198
Residual,237722.121212,31.0,7668.455523,,,,


In [12]:
# Per-variable table: "Sum. Intra" / "Sum. Inter" sums of squares, correlation
# ratio, F statistic and p-value (values are shown rounded to 4 decimals).
my_cda.correlation_ratio_

Unnamed: 0,Sum. Intra,Sum. Inter,correlation ratio,F-stats,pvalue
Temperature,237722.1212,420067.4082,0.6386,27.3893,0.0
Soleil,202192.3712,326909.07,0.6179,25.0607,0.0
Chaleur,1664.3712,1646.57,0.4973,15.3342,0.0
Pluie,178499.2121,97191.1702,0.3525,8.4396,0.0012


In [13]:
# Multivariate tests on `Qualite`: Wilks' lambda, Pillai's trace,
# Hotelling-Lawley trace and Roy's greatest root. The result renders as a
# plain-text table, hence print().
print(my_cda.manova_)

                 Multivariate linear model
                                                            
------------------------------------------------------------
        Qualite         Value  Num DF  Den DF F Value Pr > F
------------------------------------------------------------
          Wilks' lambda 0.2053 8.0000 56.0000  8.4505 0.0000
         Pillai's trace 0.8880 8.0000 58.0000  5.7896 0.0000
 Hotelling-Lawley trace 3.4174 8.0000 37.7500 11.7280 0.0000
    Roy's greatest root 3.2789 4.0000 29.0000 23.7717 0.0000



In [14]:
# `tukey_` is indexed by feature name; show the Tukey HSD pairwise comparison
# of class means for Temperature.
tukey = my_cda.tukey_
print(tukey["Temperature"])

     Multiple Comparison of Means - Tukey HSD, FWER=0.05     
 group1   group2   meandiff p-adj    lower     upper   reject
-------------------------------------------------------------
     Bon Mediocre -269.0303    0.0 -358.9958 -179.0648   True
     Bon    Moyen -165.4545 0.0003  -257.355  -73.5541   True
Mediocre    Moyen  103.5758 0.0213   13.6103  193.5412   True
-------------------------------------------------------------


In [15]:
# Pairwise t-tests between classes for Temperature with Bonferroni-corrected
# p-values (method=bonf in the output).
bonf_test = my_cda.bonferroni_correction_
print(bonf_test["Temperature"])

Test Multiple Comparison ttest_ind 
FWER=0.05 method=bonf
alphacSidak=0.02, alphacBonf=0.017
 group1   group2    stat   pval  pval_corr reject
-------------------------------------------------
     Bon Mediocre  7.9611    0.0       0.0   True
     Bon    Moyen  4.0363 0.0006    0.0019   True
Mediocre    Moyen -2.9072 0.0084    0.0253   True
-------------------------------------------------


In [16]:
# Same pairwise t-tests for Temperature, this time with the Sidak correction
# (method=sidak in the output).
sidak = my_cda.sidak_
print(sidak["Temperature"])

Test Multiple Comparison ttest_ind 
FWER=0.05 method=sidak
alphacSidak=0.02, alphacBonf=0.017
 group1   group2    stat   pval  pval_corr reject
-------------------------------------------------
     Bon Mediocre  7.9611    0.0       0.0   True
     Bon    Moyen  4.0363 0.0006    0.0019   True
Mediocre    Moyen -2.9072 0.0084    0.0251   True
-------------------------------------------------


In [17]:
# 4x4 feature-by-feature covariance-type matrix.
# Presumably the total-sample covariance (the `t` prefix) - TODO confirm.
my_cda.tcov_

Unnamed: 0,Temperature,Soleil,Chaleur,Pluie
Temperature,19346.750865,12360.302768,1187.420415,-5130.448097
Soleil,12360.302768,15561.807093,795.792388,-5317.760381
Chaleur,1187.420415,795.792388,97.380623,-356.451557
Pluie,-5130.448097,-5317.760381,-356.451557,8108.540657


In [18]:
# Feature-by-axis (LD1, LD2) table.
# Presumably the total canonical structure, i.e. correlations between the
# features and the discriminant axes over the whole sample - TODO confirm.
my_cda.tcorr_

Unnamed: 0,LD1,LD2
Temperature,-0.900589,-0.374779
Soleil,-0.896744,0.11619
Chaleur,-0.770513,-0.59003
Pluie,0.662815,-0.361294


In [19]:
# Feature covariance matrices stacked per class of `Qualite`
# (rows are indexed by class, then feature).
my_cda.gcov_

Unnamed: 0_level_0,Unnamed: 1_level_0,Temperature,Soleil,Chaleur,Pluie
Qualite,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Bon,Temperature,8474.454545,3390.845455,585.281818,-2709.1
Bon,Soleil,3390.845455,6449.054545,163.118182,-1336.1
Bon,Chaleur,585.281818,163.118182,77.472727,-306.1
Bon,Pluie,-2709.1,-1336.1,-306.1,2734.6
Mediocre,Temperature,4807.878788,-1003.606061,233.515152,227.060606
Mediocre,Soleil,-1003.606061,7813.356061,157.416667,-59.787879
Mediocre,Chaleur,233.515152,157.416667,39.719697,-25.575758
Mediocre,Pluie,227.060606,-59.787879,-25.575758,10992.606061
Moyen,Temperature,10009.090909,3541.590909,587.945455,3793.063636
Moyen,Soleil,3541.590909,5175.490909,188.245455,912.063636


In [20]:
# 4x4 feature-by-feature covariance-type matrix.
# Presumably the pooled within-class covariance (the `w` prefix) - TODO confirm.
my_cda.wcov_

Unnamed: 0,Temperature,Soleil,Chaleur,Pluie
Temperature,6991.827094,1714.255793,420.615865,392.273619
Soleil,1714.255793,5946.834447,154.271168,-144.059715
Chaleur,420.615865,154.271168,48.952094,-31.750446
Pluie,392.273619,-144.059715,-31.750446,5249.976827


In [21]:
# Feature-by-axis (LD1, LD2) table.
# Presumably the pooled within-class canonical structure - TODO confirm.
my_cda.wcorr_

Unnamed: 0,LD1,LD2
Temperature,-0.724221,-0.584256
Soleil,-0.70128,0.176148
Chaleur,-0.525372,-0.77991
Pluie,0.398218,-0.420797


In [22]:
# Feature-by-axis (LD1, LD2) table.
# Presumably the between-class canonical structure - TODO confirm.
my_cda.bcorr_

Unnamed: 0,LD1,LD2
Temperature,-0.986651,-0.211244
Soleil,-0.998654,0.002625
Chaleur,-0.957391,-0.335599
Pluie,0.976576,-0.166812


In [23]:
# Raw eigenvalue array; after the transpose each row is one discriminant axis
# holding eigenvalue, difference, proportion (%) and cumulative (%). The same
# numbers appear with labels in the get_eigenvalue() table later on.
my_cda.eig_.T

array([[  3.27886049,   3.14028647,  95.94508632,  95.94508632],
       [  0.13857402,          nan,   4.05491368, 100.        ]])

In [24]:
# Intercepts of the two discriminant functions (LD1, LD2); these match the
# "Intercept" row of the coefficient table shown later in the notebook.
my_cda.intercept_

array([32.87628192, -2.16527944])

In [25]:
# Raw coefficient array: one row per feature, one column per axis (LD1, LD2).
my_cda.coef_

array([[-8.56604551e-03,  4.62505890e-05],
       [-6.77386899e-03,  5.32929330e-03],
       [ 2.70544919e-02, -1.27636164e-01],
       [ 5.86566500e-03, -6.17455623e-03]])

In [26]:
# get_candisc returns a result indexed by key; its "coord" entry holds the
# coordinates of the observations on the discriminant axes (LD1, LD2).
from scientisttools.extractfactor import get_candisc
row_coord = get_candisc(my_cda)
row_coord["coord"].head()

Unnamed: 0_level_0,LD1,LD2
Annee,Unnamed: 1_level_1,Unnamed: 2_level_1
1924,0.882552,0.871537
1925,2.325456,0.09422
1926,0.994856,-0.832957
1927,2.726862,-0.247244
1928,-0.743596,-1.721167


In [27]:
# Global tests of the model: Wilks' Lambda together with the Bartlett and Rao
# statistics and their p-values.
my_cda.global_performance_

Unnamed: 0,Stat,Value,p-value
0,Wilks' Lambda,0.205263,
1,Bartlett -- C(8),46.712169,1.739815e-07
2,"Rao -- F(8,56)",8.450507,1.890358e-07


In [28]:
# Two-row table of F statistics with numerator/denominator degrees of freedom
# and p-values.
# Presumably sequential likelihood-ratio tests of the discriminant dimensions
# (row 0 matches the Rao F above) - TODO confirm.
my_cda.likelihood_test_

Unnamed: 0,statistic,DDL num.,DDL den.,Pr>F
0,8.450507,8.0,56.0,1.890358e-07
1,1.339549,3.0,29.0,0.280785


In [29]:
# Project the training observations onto the discriminant axes (LD1, LD2).
# The predictors are selected through the feature list stored on the fitted
# model, as the later `X = wine[my_cda.features_labels_]` cell does, instead of
# hard-coding which non-feature columns ("Qualite", "Obs.") to drop.
transform = my_cda.transform(wine[my_cda.features_labels_])
# First five rows only.
transform.head()

Unnamed: 0_level_0,LD1,LD2
Annee,Unnamed: 1_level_1,Unnamed: 2_level_1
1924,0.882552,0.871537
1925,2.325456,0.09422
1926,0.994856,-0.832957
1927,2.726862,-0.247244
1928,-0.743596,-1.721167


In [30]:
# One new observation, labelled '1958', laid out as a single-row frame whose
# columns follow the feature order stored on the fitted model.
Xtest = pd.DataFrame(
    data=np.array([[3000, 1100, 20, 300]]),
    index=['1958'],
    columns=my_cda.features_labels_,
)
Xtest

Unnamed: 0,Temperature,Soleil,Chaleur,Pluie
1958,3000,1100,20,300


In [31]:
# Coordinates of the new observation on the discriminant axes (LD1, LD2).
my_cda.transform(Xtest)

Unnamed: 0,LD1,LD2
1958,2.027679,-0.569395


In [32]:
# Predictor columns only, selected through the feature list stored on the model.
X = wine[my_cda.features_labels_]

In [33]:
# Per-class probabilities for the training observations (one column per class),
# rounded to 4 decimals for display only - `predict_proba` keeps full precision.
predict_proba = my_cda.predict_proba(X)
predict_proba.head().round(4)

Unnamed: 0_level_0,Bon,Mediocre,Moyen
Annee,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1924,0.0067,0.3446,0.6487
1925,0.0,0.9588,0.0411
1926,0.0092,0.698,0.2927
1927,0.0,0.9865,0.0135
1928,0.6417,0.0313,0.327


In [34]:
# Per-class decision scores for the new observation.
# NOTE(review): the output columns are labelled 0, 1, 2 rather than by class
# name; presumably they follow the order of `my_cda.classes_` - confirm.
my_cda.decision_function(Xtest)

Unnamed: 0,0,1,2
1958,-7.56466,1.114437,-1.859627


In [35]:
# Full-precision view of the class probabilities. `predict_proba` was already
# computed from `X` in the earlier rounded-display cell and has not been
# modified since, so it is reused here rather than recomputed.
predict_proba.head()

Unnamed: 0_level_0,Bon,Mediocre,Moyen
Annee,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1924,0.006695,0.344613,0.648692
1925,4.5e-05,0.958846,0.041109
1926,0.009222,0.698039,0.292739
1927,9e-06,0.986519,0.013472
1928,0.641715,0.031256,0.327029


In [36]:
# Predicted class for the new observation.
my_cda.predict(Xtest)

Unnamed: 0,predict
1958,Mediocre


In [37]:
# Score of the model on the training data against the true `Qualite` labels.
# Presumably the classification accuracy - TODO confirm. Being computed on the
# training set, it is an optimistic estimate.
my_cda.score(X,wine["Qualite"])

0.7941176470588235

In [38]:
# Labelled eigenvalue table: eigenvalue, difference, proportion and cumulative
# percentage for each discriminant axis. The other two imported helpers are
# used in the following cells.
from scientisttools.extractfactor import get_eigenvalue,summaryCANDISC,get_candisc_coef
eig = get_eigenvalue(my_cda)
eig

Unnamed: 0,eigenvalue,difference,proportion,cumulative
LD1,3.27886,3.140286,95.945086,95.945086
LD2,0.138574,,4.054914,100.0


In [39]:
# Coefficients of the discriminant functions per axis (LD1, LD2): one row per
# feature plus an "Intercept" row.
coef = get_candisc_coef(my_cda,choice="absolute")
coef

Unnamed: 0,LD1,LD2
Temperature,-0.008566,4.6e-05
Soleil,-0.006774,0.005329
Chaleur,0.027054,-0.127636
Pluie,0.005866,-0.006175
Intercept,32.876282,-2.165279


In [40]:
# Per-class coefficients (one column per class): one row per feature plus an
# "Intercept" row.
# NOTE: this rebinds `coef`, replacing the per-axis table from the previous cell.
coef = get_candisc_coef(my_cda,choice="score")
coef

Unnamed: 0,Bon,Mediocre,Moyen
Temperature,0.018164,-0.017821,0.001277
Soleil,0.012925,-0.015263,0.003726
Chaleur,-0.022716,0.084484,-0.069449
Pluie,-0.010768,0.013562,-0.004026
Intercept,-72.590473,65.609287,-7.191833


In [41]:
# Print the consolidated results report, with tables rendered as Markdown.
summaryCANDISC(my_cda,to_markdown=True)

                     Canonical Discriminant Analysis - Results                     


Summary Information
|       |   Total Sample Size |   Variables |   Classes |   DF Total |   DF Within Classes |   DF Between Classes |
|:------|--------------------:|------------:|----------:|-----------:|--------------------:|---------------------:|
| value |                  34 |           4 |         3 |         33 |                  31 |                    2 |

Class Level information
|               |   Frequency |   Proportion |
|:--------------|------------:|-------------:|
| ('Mediocre',) |          12 |     0.352941 |
| ('Bon',)      |          11 |     0.323529 |
| ('Moyen',)    |          11 |     0.323529 |

Importance of components
|                         |    LD1 |     LD2 |
|:------------------------|-------:|--------:|
| Variance                |  3.279 |   0.139 |
| Difference              |  3.14  | nan     |
| % of var.               | 95.945 |   4.055 |
| Cumulative of % of var.