# Canonical Discriminant Analysis (CANDISC)

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")

In [2]:
# Load the libraries

import pandas as pd

# Load the dataset: row 0 of the sheet is the header and the first column
# becomes the row index.
DTrain = pd.read_excel(
    io="./data/wine_quality.xls",
    header=0,
    index_col=0,
)
DTrain.head(n=6)

Unnamed: 0_level_0,Annee,Temperature,Soleil,Chaleur,Pluie,Qualite
Obs.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1924,3064,1201,10,361,Moyen
2,1925,3000,1053,11,338,Mediocre
3,1926,3155,1133,19,393,Moyen
4,1927,3085,970,4,467,Mediocre
5,1928,3245,1258,36,294,Bon
6,1929,3267,1386,35,225,Bon


## Set `Annee` as the index

In [3]:
# Promote the "Annee" (year) column to the row index.
# The membership guard keeps the cell idempotent: once "Annee" has become the
# index it is no longer a column, so an unguarded set_index would raise
# KeyError if this cell were executed a second time.
if "Annee" in DTrain.columns:
    DTrain = DTrain.set_index("Annee")
DTrain.head(6)

Unnamed: 0_level_0,Temperature,Soleil,Chaleur,Pluie,Qualite
Annee,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1924,3064,1201,10,361,Moyen
1925,3000,1053,11,338,Mediocre
1926,3155,1133,19,393,Moyen
1927,3085,970,4,467,Mediocre
1928,3245,1258,36,294,Bon
1929,3267,1386,35,225,Bon


In [4]:
from discrimintools import CANDISC

In [5]:
# Configure the canonical discriminant analysis: two components, "Qualite"
# named as the target column, no explicit priors, no parallelisation.
candisc = CANDISC(
    n_components=2,
    target=["Qualite"],
    priors=None,
    parallelize=False,
)
# Training: the whole frame (target column included) is handed to fit().
candisc.fit(DTrain)

In [7]:
# Show the `class_` attribute of the fitted model; in the recorded run it
# rendered as a table of sample size, variable/class counts and degrees of freedom.
candisc.class_

Unnamed: 0,infos,Value,DF,DF value
0,Total Sample Size,34,DF Total,33
1,Variables,4,DF Within Classes,31
2,Classes,3,DF Between Classes,2


In [6]:
# Predict the class of each training observation from the explanatory
# variables only: the label column "Qualite" is dropped so that it is not
# passed to predict() as if it were a feature (the later cells do the same
# through XTrain).
# NOTE(review): the recorded run of this cell ended with
# "TypeError: Series.name must be a hashable type"; that may instead originate
# from `target` being given as a list to CANDISC — confirm against the
# installed discrimintools version.
candisc.predict(X=DTrain.drop(columns=["Qualite"]))

['Moyen' 'Mediocre' 'Mediocre' 'Mediocre' 'Bon' 'Bon' 'Mediocre'
 'Mediocre' 'Mediocre' 'Bon' 'Bon' 'Moyen' 'Mediocre' 'Bon' 'Moyen'
 'Moyen' 'Moyen' 'Mediocre' 'Moyen' 'Bon' 'Moyen' 'Bon' 'Moyen' 'Bon'
 'Moyen' 'Bon' 'Bon' 'Mediocre' 'Bon' 'Moyen' 'Mediocre' 'Moyen' 'Moyen'
 'Mediocre']


TypeError: Series.name must be a hashable type

In [None]:
from scientisttools.ggplot import fviz_candisc

In [None]:
# Build the CANDISC plot with scientisttools' fviz_candisc and keep it in `p`.
p = fviz_candisc(candisc)
# print() is called on the returned object instead of relying on the cell's
# rich display. NOTE(review): presumably this is what triggers rendering for
# the plotting backend in use — confirm.
print(p)

In [None]:
# Display the `correlation_ratio_` attribute of the fitted model.
candisc.correlation_ratio_

In [None]:
# Display the `anova_` attribute of the fitted model.
candisc.anova_

In [None]:
# Display the `manova_` attribute of the fitted model through print().
print(candisc.manova_) # remember to use print here

## Coefficients canoniques bruts

In [None]:
from scientisttools.extractfactor import get_candisc_coef

In [None]:
# Coefficients: extract them from the fitted model with get_candisc_coef,
# leaving `choice` at its default, then display the result.
# NOTE(review): per the section heading these should be the raw canonical
# coefficients — confirm against the library documentation.
coef = get_candisc_coef(candisc)
coef

In [None]:
from scientisttools.pyplot import plotCANDISC
import matplotlib.pyplot as plt 

# Draw the fitted model on a wide 16x8 figure, giving plotCANDISC three
# colours and three markers, then render the figure.
fig, axe = plt.subplots(figsize=(16, 8))
plotCANDISC(
    candisc,
    color=["blue", '#5DC83F', 'red'],
    marker=['o', "*", '>'],
    ax=axe,
)
plt.show()

In [None]:
# Display the `global_performance_` attribute of the fitted model.
candisc.global_performance_

In [None]:
# Display the `likelihood_test_` attribute of the fitted model.
candisc.likelihood_test_

In [None]:
from scientisttools.extractfactor import get_candisc_var
# Covariance: the objects returned by get_candisc_var(..., choice="covariance")
# are stacked row-wise (axis=0) into one table with pd.concat.
pd.concat(get_candisc_var(candisc,choice="covariance"),axis=0)

In [None]:
# Correlation with the axes: the objects returned by
# get_candisc_var(..., choice="correlation") are stacked row-wise (axis=0).
pd.concat(get_candisc_var(candisc,choice="correlation"),axis=0)

### Individus supplémentaires

In [None]:
# Supplementary individual: a single observation, indexed by the year 1958,
# carrying the four explanatory variables.
XTest = pd.DataFrame(
    data=[{"Temperature": 3000, "Soleil": 1100, "Chaleur": 20, "Pluie": 300}],
    index=[1958],
)
XTest

In [None]:
# Apply the fitted model's transform() to the supplementary observation.
candisc.transform(XTest)

In [None]:
# Evaluate decision_function() on the supplementary observation.
candisc.decision_function(XTest)

In [None]:
# Evaluate predict_proba() on the supplementary observation.
candisc.predict_proba(XTest)

## Fonctions de décision

In [None]:
# Same extractor as for the earlier coefficients, this time with choice="score".
# NOTE(review): given the section heading these are presumably the
# decision-function coefficients — confirm.
score_coef = get_candisc_coef(candisc,choice="score")
score_coef

In [None]:
# Feature matrix: the training frame without the "Qualite" label column.
XTrain = DTrain.drop("Qualite", axis="columns")

In [None]:
# decision_function() on the training features; only the first rows are shown.
candisc.decision_function(XTrain).head()

In [None]:
# predict_proba() on the training features; only the first rows are shown.
candisc.predict_proba(XTrain).head()

In [None]:
# predict() on the training features; only the first rows are shown.
# NOTE(review): .head() requires predict() to return a pandas object; the
# output recorded earlier in this notebook looks like a NumPy array — confirm.
candisc.predict(XTrain).head()

In [None]:
# Score the model on the training features against the observed "Qualite" labels.
# NOTE(review): presumably a classification accuracy — confirm.
candisc.score(XTrain,DTrain["Qualite"])

In [None]:
from scientisttools.extractfactor import summaryCANDISC
# Summarise the fitted model with summaryCANDISC, passing to_markdown=True.
# NOTE(review): presumably this renders the tables as Markdown — confirm.
summaryCANDISC(candisc,to_markdown=True)

## Backward Elimination

In [None]:
from scientisttools.discriminant_analysis import STEPDISC

# Variable selection with method="backward", run on the fitted CANDISC model.
# NOTE(review): alpha=0.01 is presumably the significance threshold and
# model_train=True presumably asks for a refit on the retained variables — confirm.
stepdisc = STEPDISC(
    method="backward",
    alpha=0.01,
    model_train=True,
    verbose=True,
)
stepdisc.fit(candisc)

In [None]:
# Display the `train_model_` attribute produced by the STEPDISC fit.
# NOTE(review): presumably the model retrained on the selected variables — confirm.
stepdisc.train_model_

In [None]:
# Same plot as for the full model, drawn for the model stored in
# stepdisc.train_model_ on a fresh 16x8 figure.
fig, axe = plt.subplots(figsize=(16, 8))
plotCANDISC(
    stepdisc.train_model_,
    color=["blue", '#5DC83F', 'red'],
    marker=['o', "*", '>'],
    ax=axe,
)
plt.show()

In [None]:
# Summary of the model stored in stepdisc.train_model_, with to_markdown=True.
summaryCANDISC(stepdisc.train_model_,to_markdown=True)