In [7]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go


def correspondence_analysis(excel_path):
    data = pd.read_excel(excel_path,index_col=0)
    print('Original Contingency table:\n',data)

    k = data.values.sum()
    print('\nTotal observations k =',k)

    frequency_table = data/k
    frequency_table['fi.'] = frequency_table.sum(axis=1)
    frequency_table.loc['f.j'] = frequency_table.sum(axis=0)

    print('\nFrequency table:\n',frequency_table.round(4))

    row_prof_tab = frequency_table.iloc[:-1,:-1].div(frequency_table['fi.'][:-1], axis=0).round(4)
    row_prof_tab['fi.'] = frequency_table['fi.'][:-1].round(4)
    print("\nRow profile table:\n",row_prof_tab)

    col_prof_tab = frequency_table.iloc[:-1,:-1].div(frequency_table.loc['f.j'][:-1],axis=1).round(4)
    col_prof_tab.loc['f.j'] = frequency_table.loc['f.j'][:-1].round(4)
    print("\nColumn profile table:\n",col_prof_tab)

    fij = frequency_table.iloc[:-1,:-1].values
    fi = frequency_table['fi.'][:-1].values
    fj = frequency_table.loc['f.j'][:-1].values

    expected = np.outer(fi,fj)
    
    centered =(fij - expected).round(4)

    di = np.diag(1/np.sqrt(fi)).round(3)
    dj = np.diag(1/np.sqrt(fj)).round(3)

    X = (di @ centered @ dj).round(4)
    print("\nX:\n",X)

    S =np.dot(X.T,X).round(4)
    print("\nS:\n",S)

    eigenvalues ,eigenvectors = np.linalg.eig(S)

    idx = np.argsort(eigenvalues)[::-1]
    sorted_eigenvalues = eigenvalues[idx]
    sorted_eigenvectors = eigenvectors[:,idx]

    n_componets=len(sorted_eigenvalues)-1

    print("\nThe eigenvalues:",eigenvalues)
    print("\nThe eigenvectors:\n",eigenvectors)

    print("\nThe sorted eigenvalues:",sorted_eigenvalues[:n_componets])
    print("\nThe sorted eigenvectors:\n",sorted_eigenvectors[:,:n_componets].round(3))


    Cik = np.dot(X , sorted_eigenvectors[:,:n_componets])
    print("\nCik:\n",Cik.round(4))

    Cjk = np.sqrt(sorted_eigenvalues[:n_componets]) * sorted_eigenvectors[:,:n_componets]
    print("\nCjk:\n",Cjk.round(4))

    row_labels = data.index.tolist()
    col_labels = data.columns.tolist()

    fig = go.Figure()

    fig.add_trace(go.Scatter(
        x=Cik[:,0], y=Cik[:,1],
        mode ='markers+text',
        marker= dict(size=10,color = 'blue'),
        text=row_labels,
        textposition='top center',
        name="Row Profiles"
    ))

    fig.add_trace(go.Scatter(
        x=Cjk[:,0],y=Cjk[:,1],
        mode='markers+text',
        marker=dict(size=10,color='red',symbol='diamond'),
        text=col_labels,
        textposition='bottom center',
        name="Column Profiles"
    ))

    fig.add_shape(
        type="line",
        x0=-0.7,y0=0, x1=0.7,y1=0,
        line=dict(color='grey',width=1,dash='dash'),
        layer='below'
    )

    fig.add_shape(
        type="line",
        x0=0,y0=-0.3, x1=0,y1=0.3,
        line=dict(color='black',width=1,dash='dash'),
        layer='below',
    )

    fig.update_layout(
        width=1600,
        height=1000,
        title="Correspondence Analysis Biplot",
        title_x = 0.5,
        xaxis_title="Dimension 1",
        yaxis_title="Dimension 2",
        showlegend=True,
        hovermode='closest',
        xaxis=dict(
            tickmode='array',
            tickvals=np.arange(-0.8, 0.9, 0.2).round(1),
            range=[-0.7, 0.7] ,
            zeroline=True,
            zerolinecolor='black',
            zerolinewidth=1,
        ),
        yaxis=dict(
            range=[-0.3, 0.3] ,
            zeroline=True,
            zerolinecolor='black',
            zerolinewidth=1,
    ))

    fig.show()


correspondence_analysis('yeux_cheveux.xlsx')
print('\n')
correspondence_analysis('niveau_secteur.xlsx')

Original Contingency table:
               Blond  Marron  Noir
Yeux\Cheveux                     
Vert             50      10     3
Bleu             43       8     0
Marron            8      66    33
Noir              2      44    50

Total observations k = 317

Frequency table:
                Blond  Marron    Noir     fi.
Yeux\Cheveux                                
Vert          0.1577  0.0315  0.0095  0.1987
Bleu          0.1356  0.0252  0.0000  0.1609
Marron        0.0252  0.2082  0.1041  0.3375
Noir          0.0063  0.1388  0.1577  0.3028
f.j           0.3249  0.4038  0.2713  1.0000

Row profile table:
                Blond  Marron    Noir     fi.
Yeux\Cheveux                                
Vert          0.7937  0.1587  0.0476  0.1987
Bleu          0.8431  0.1569  0.0000  0.1609
Marron        0.0748  0.6168  0.3084  0.3375
Noir          0.0208  0.4583  0.5208  0.3028

Column profile table:
                Blond  Marron    Noir
Yeux\Cheveux                        
Vert          0.



Original Contingency table:
                 Public  Privé  Indépendant
Niveau\Secteur                            
Bac                 25     40           10
Licence             35     60           20
Master              20     30           15

Total observations k = 255

Frequency table:
                 Public   Privé  Indépendant     fi.
Niveau\Secteur                                     
Bac             0.0980  0.1569       0.0392  0.2941
Licence         0.1373  0.2353       0.0784  0.4510
Master          0.0784  0.1176       0.0588  0.2549
f.j             0.3137  0.5098       0.1765  1.0000

Row profile table:
                 Public   Privé  Indépendant     fi.
Niveau\Secteur                                     
Bac             0.3333  0.5333       0.1333  0.2941
Licence         0.3043  0.5217       0.1739  0.4510
Master          0.3077  0.4615       0.2308  0.2549

Column profile table:
                 Public   Privé  Indépendant
Niveau\Secteur                             
Ba

In [8]:
correspondence_analysis('category-country.xlsx')

Original Contingency table:
                     Chemistry  Economics  Literature  Medicine  Peace  Physics
country \ Category                                                            
Australia                   0          0           0         4      0        1
Austria                     1          1           2         4      1        2
Belgium                     1          0           1         3      4        1
Canada                      3          0           2         3      1        3
China                       0          0           2         1      2        0
Denmark                     2          0           4         4      1        3
France                     10          2          13        10      9       15
Germany                    33          1           6        16      6       24
India                       0          0           2         0      1        1
Ireland                     0          0           4         0      4        1
Italy                  

In [9]:
correspondence_analysis('category-gender.xlsx')

Original Contingency table:
                  female  male
Category\gender              
Chemistry             7   182
Economics             3    89
Literature           17    99
Medicine             13   214
Peace                16    88
Physics               5   220

Total observations k = 953

Frequency table:
                  female    male     fi.
Category\gender                        
Chemistry        0.0073  0.1910  0.1983
Economics        0.0031  0.0934  0.0965
Literature       0.0178  0.1039  0.1217
Medicine         0.0136  0.2246  0.2382
Peace            0.0168  0.0923  0.1091
Physics          0.0052  0.2308  0.2361
f.j              0.0640  0.9360  1.0000

Row profile table:
                  female    male     fi.
Category\gender                        
Chemistry        0.0370  0.9630  0.1983
Economics        0.0326  0.9674  0.0965
Literature       0.1466  0.8534  0.1217
Medicine         0.0573  0.9427  0.2382
Peace            0.1538  0.8462  0.1091
Physics          0.0222

IndexError: index 1 is out of bounds for axis 1 with size 1