In [1]:
%pip install discopula


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


> Make sure to have discopula's latest version installed using `pip`. More information about the latest version can be found at https://pypi.org/project/discopula/

If you run into issues related to `pip` or `scipy`, run the following upgrade commands in your terminal:

```
# pip install --upgrade pip
# pip install --upgrade scipy
```

In [2]:
import numpy as np
from discopula import GenericCheckerboardCopula

# 2-Dimensional Case 

### Create Sample Contingency Table and Initialize the GenericCheckerboardCopula

When a `GenericCheckerboardCopula` object is initialized from a contingency table (a nested NumPy array, e.g. `np.array([[...], ...])`), the axes are indexed from the outermost dimension inward, starting at 0. For this 2D contingency table, `axis = 0` has 5 categories and `axis = 1` has 3 categories. Following the JMA2021 paper, $X_1$ is at `axis = 0` and $X_2$ is at `axis = 1`.

In [3]:

# Observed counts: the 5 rows are the categories of axis 0 (X1) and the
# 3 columns are the categories of axis 1 (X2).
contingency_table = np.array(
    [
        [0, 0, 20],
        [0, 10, 0],
        [20, 0, 0],
        [0, 10, 0],
        [0, 0, 20],
    ]
)

# Build the copula from the table of counts; the joint probability matrix it
# infers is exposed as `copula.P`.
copula = GenericCheckerboardCopula.from_contingency_table(contingency_table)

print(
    f"Shape of the inferred joint probability matrix P: {copula.P.shape}",
    f"Probability matrix P:\n{copula.P}",
    sep="\n",
)

Shape of the inferred joint probability matrix P: (5, 3)
Probability matrix P:
[[0.    0.    0.25 ]
 [0.    0.125 0.   ]
 [0.25  0.    0.   ]
 [0.    0.125 0.   ]
 [0.    0.    0.25 ]]


### Calculating CCRAM & SCCRAM (non-vectorized)

In [4]:
# Unscaled CCRAM in both directions: axis 0 (X1) -> axis 1 (X2), then the reverse.
ccram_X1_to_X2 = copula.calculate_CCRAM(from_axes=[0], to_axis=1)
ccram_X2_to_X1 = copula.calculate_CCRAM(from_axes=[1], to_axis=0)

# Scaled variant (SCCRAM): the same calls with scaled=True.
sccram_X1_to_X2 = copula.calculate_CCRAM(from_axes=[0], to_axis=1, scaled=True)
sccram_X2_to_X1 = copula.calculate_CCRAM(from_axes=[1], to_axis=0, scaled=True)

# Report all four measures, rounded to four decimal places.
print(f"CCRAM X1 to X2: {ccram_X1_to_X2:.4f}")
print(f"CCRAM X2 to X1: {ccram_X2_to_X1:.4f}")
print(f"SCCRAM X1 to X2: {sccram_X1_to_X2:.4f}")
print(f"SCCRAM X2 to X1: {sccram_X2_to_X1:.4f}")

CCRAM X1 to X2: 0.8438
CCRAM X2 to X1: 0.0000
SCCRAM X1 to X2: 1.0000
SCCRAM X2 to X1: 0.0000


### Getting Category Predictions

In [5]:
# Predicted category of axis 1 for every category of axis 0, using the
# library's default column labels.
predictions_X1_to_X2 = copula.get_category_predictions_multi(
    from_axes=[0],
    to_axis=1,
)
print("\nPredictions from X1 to X2:", predictions_X1_to_X2, sep="\n")

# Reverse direction (axis 1 -> axis 0), this time labelling the columns with
# human-readable axis names.
axis_to_name_dict = {
    0: "Income Bracket",
    1: "Education Level",
}
predictions_1_to_0 = copula.get_category_predictions_multi(
    from_axes=[1],
    to_axis=0,
    axis_names=axis_to_name_dict,
)
print("\nPredictions from axis 1 to axis 0:", predictions_1_to_0, sep="\n")


Predictions from X1 to X2:
   X0 Category  Predicted X1 Category
0            1                      3
1            2                      2
2            3                      1
3            4                      2
4            5                      3

Predictions from axis 1 to axis 0:
   Education Level Category  Predicted Income Bracket Category
0                         1                                  3
1                         2                                  3
2                         3                                  3


### Calculating Scores and their Variances

In [21]:
# Calculate and display the scores for both axes of the 2D table.
# The hand-computed expected values are enforced with assertions (instead of
# living only in comments), so a regression fails loudly on Restart & Run All
# rather than silently printing different numbers.
scores_axis0 = copula.calculate_scores(axis=0)
scores_axis1 = copula.calculate_scores(axis=1)

print("Scores for axis 0:")
print(scores_axis0)
np.testing.assert_allclose(scores_axis0, [0.125, 0.3125, 0.5, 0.6875, 0.875])

print("\nScores for axis 1:")
print(scores_axis1)
np.testing.assert_allclose(scores_axis1, [0.125, 0.375, 0.75])

# Calculate and display the variance of the scores along each axis.
variance_axis0 = copula.calculate_variance_S(axis=0)
variance_axis1 = copula.calculate_variance_S(axis=1)

print("\nVariance of scores for axis 0:", variance_axis0)
np.testing.assert_allclose(variance_axis0, 81 / 1024)  # = 0.0791015625
print("Variance of scores for axis 1:", variance_axis1)
np.testing.assert_allclose(variance_axis1, 9 / 128)  # = 0.0703125

Scores for axis 0:
[np.float64(0.125), np.float64(0.3125), np.float64(0.5), np.float64(0.6875), np.float64(0.875)]

Scores for axis 1:
[np.float64(0.125), np.float64(0.375), np.float64(0.75)]

Variance of scores for axis 0: 0.0791015625
Variance of scores for axis 1: 0.0703125


# 4-Dimensional Case (Real Data Analysis from JMA2021)

### Create Sample Data in Cases Form and Initialize the GenericCheckerboardCopula

When a `GenericCheckerboardCopula` object is initialized from cases data (a 2D NumPy array with one row per observed case), the axes are indexed from the outermost dimension inward, starting at 0. For these 4D cases, as specified by the `shape` parameter, `axis = 0` has 2 categories, `axis = 1` has 3 categories, `axis = 2` has 2 categories, and `axis = 3` has 6 categories. Following the JMA2021 paper, $X_1$ is at `axis = 0`, $X_2$ at `axis = 1`, $X_3$ at `axis = 2`, and $X_4$ at `axis = 3`.

In [None]:
# Real-data cases for the 4-dimensional example: each inner list is one
# observed case, given as 0-based category indices in the order
# [axis 0, axis 1, axis 2, axis 3]. A case is repeated once per observation
# (101 cases in total). Each "RDA Row k" comment groups the cases that share
# the same (axis 0, axis 1, axis 2) combination.
real_cases_data = np.array([
    # RDA Row 1
    [0,2,0,1],[0,2,0,4],[0,2,0,4],
    [0,2,0,5], [0,2,0,5],[0,2,0,5],[0,2,0,5],
    # RDA Row 2
    [0,2,1,3],[0,2,1,4],[0,2,1,4],[0,2,1,4],
    # RDA Row 3
    [0,1,0,1],[0,1,0,1],[0,1,0,2],[0,1,0,2],[0,1,0,2],
    [0,1,0,4],[0,1,0,4],[0,1,0,4],[0,1,0,4],[0,1,0,4],[0,1,0,4],
    [0,1,0,5],[0,1,0,5],[0,1,0,5],[0,1,0,5],
    # RDA Row 4
    [0,1,1,1],[0,1,1,3],[0,1,1,3],[0,1,1,5],
    # RDA Row 5
    [0,0,0,4],[0,0,0,4],[0,0,0,5],[0,0,0,5],
    # RDA Row 6
    [0,0,1,2],[0,0,1,3],[0,0,1,4],[0,0,1,4],[0,0,1,4],
    # RDA Row 7
    [1,2,0,2],[1,2,0,2],[1,2,0,2],[1,2,0,4],[1,2,0,5],[1,2,0,5],
    # RDA Row 8
    [1,2,1,1],[1,2,1,4],[1,2,1,4],[1,2,1,4],
    # RDA Row 9
    [1,1,0,1],[1,1,0,1],[1,1,0,1],[1,1,0,2],[1,1,0,2],[1,1,0,2],[1,1,0,2],
    [1,1,0,3],[1,1,0,3],[1,1,0,3],[1,1,0,3],[1,1,0,3],
    [1,1,0,4],[1,1,0,4],[1,1,0,4],[1,1,0,4],[1,1,0,4],[1,1,0,4],
    [1,1,0,5],[1,1,0,5],
    # RDA Row 10
    [1,1,1,0],[1,1,1,1],[1,1,1,1],[1,1,1,1],[1,1,1,1],
    [1,1,1,2],[1,1,1,2],[1,1,1,2],[1,1,1,2],
    [1,1,1,3],[1,1,1,3],[1,1,1,3],[1,1,1,5],
    # RDA Row 11
    [1,0,0,0],[1,0,0,0],[1,0,0,1],[1,0,0,1],[1,0,0,2],
    [1,0,0,3],[1,0,0,3],[1,0,0,3],[1,0,0,3],[1,0,0,3],
    [1,0,0,4],[1,0,0,4],
    # RDA Row 12
    [1,0,1,0],[1,0,1,0],[1,0,1,2],[1,0,1,2],
    [1,0,1,3],[1,0,1,3],[1,0,1,3]
])
# `shape` gives the number of categories on each axis: 2, 3, 2 and 6.
rda_copula = GenericCheckerboardCopula.from_cases(cases=real_cases_data, shape=(2,3,2,6))
# Show the inferred joint probability matrix and the per-axis marginals.
print(f"Shape of the inferred joint probability matrix P: {rda_copula.P.shape}")
print(f"Probability matrix P:\n{rda_copula.P}\n")
print(f"Marginal pdfs:\n{rda_copula.marginal_pdfs}\n")
print(f"Marginal cdfs:\n{rda_copula.marginal_cdfs}")

Shape of the inferred joint probability matrix P: (2, 3, 2, 6)
Probability matrix P:
[[[[0.         0.         0.         0.         0.01980198 0.01980198]
   [0.         0.         0.00990099 0.00990099 0.02970297 0.        ]]

  [[0.         0.01980198 0.02970297 0.         0.05940594 0.03960396]
   [0.         0.00990099 0.         0.01980198 0.         0.00990099]]

  [[0.         0.00990099 0.         0.         0.01980198 0.03960396]
   [0.         0.         0.         0.00990099 0.02970297 0.        ]]]


 [[[0.01980198 0.01980198 0.00990099 0.04950495 0.01980198 0.        ]
   [0.01980198 0.         0.01980198 0.02970297 0.         0.        ]]

  [[0.         0.02970297 0.03960396 0.04950495 0.05940594 0.01980198]
   [0.00990099 0.03960396 0.03960396 0.02970297 0.         0.00990099]]

  [[0.         0.         0.02970297 0.         0.00990099 0.01980198]
   [0.         0.00990099 0.         0.         0.02970297 0.        ]]]]

Marginal pdfs:
{0: array([0.38613861, 0.6138613

### Calculating CCRAM & SCCRAM (non-vectorized)

In [None]:
# CCRAM and its scaled counterpart (SCCRAM) for predicting axis 3 from
# axes 0, 1 and 2 jointly.
rda_ccram_012_to_3 = rda_copula.calculate_CCRAM(
    from_axes=[0, 1, 2],
    to_axis=3,
)
rda_sccram_012_to_3 = rda_copula.calculate_CCRAM(
    from_axes=[0, 1, 2],
    to_axis=3,
    scaled=True,
)

# Report both measures, rounded to four decimal places.
print(f"CCRAM 012->3: {rda_ccram_012_to_3:.4f}")
print(f"SCCRAM 012->3: {rda_sccram_012_to_3:.4f}")

CCRAM 012->3: 0.2604
SCCRAM 012->3: 0.2716


### Getting Category Predictions

In [25]:
# Predicted category of axis 3 for every combination of categories on
# axes 0, 1 and 2, using the library's default column labels.
rda_predictions_012_to_3 = rda_copula.get_category_predictions_multi(
    from_axes=[0, 1, 2],
    to_axis=3,
)
print("\nPredictions from axis 012 to axis 3:", rda_predictions_012_to_3, sep="\n")

# Same predictions, with the columns labelled by the paper's variable names.
rda_axis_to_name_dict = {
    0: "X1",
    1: "X2",
    2: "X3",
    3: "Y = X4",
}
rda_predictions_012_to_3_named = rda_copula.get_category_predictions_multi(
    from_axes=[0, 1, 2],
    to_axis=3,
    axis_names=rda_axis_to_name_dict,
)
print("\nPredictions from X1, X2, X3 to Y = X4:", rda_predictions_012_to_3_named, sep="\n")


Predictions from axis 012 to axis 3:
    X0 Category  X1 Category  X2 Category  Predicted X3 Category
0             1            1            1                      5
1             1            1            2                      5
2             1            2            1                      5
3             1            2            2                      4
4             1            3            1                      5
5             1            3            2                      5
6             2            1            1                      3
7             2            1            2                      3
8             2            2            1                      4
9             2            2            2                      3
10            2            3            1                      4
11            2            3            2                      4

Predictions from X1, X2, X3 to Y = X4:
    X1 Category  X2 Category  X3 Category  Predicted Y = X4 Category
0       

### Calculating Scores and their Variances

In [26]:
# Calculate the scores for each of the four axes, in axis order 0..3.
(
    rda_scores_axis0,
    rda_scores_axis1,
    rda_scores_axis2,
    rda_scores_axis3,
) = (rda_copula.calculate_scores(axis=k) for k in range(4))

print("Scores for axis 0:")
print(rda_scores_axis0)
print("\nScores for axis 1:")
print(rda_scores_axis1)
print("\nScores for axis 2:")
print(rda_scores_axis2)
print("\nScores for axis 3:")
print(rda_scores_axis3)

# Calculate the variance of the scores along each of the four axes.
(
    rda_variance_axis0,
    rda_variance_axis1,
    rda_variance_axis2,
    rda_variance_axis3,
) = (rda_copula.calculate_variance_S(axis=k) for k in range(4))

print("\nVariance of scores for axis 0:", rda_variance_axis0)
print("\nVariance of scores for axis 1:", rda_variance_axis1)
print("\nVariance of scores for axis 2:", rda_variance_axis2)
print("\nVariance of scores for axis 3:", rda_variance_axis3)
# Expected 12 * (variance of scores for axis 3): 0.07987568681385342*12 = 0.95850824176

Scores for axis 0:
[np.float64(0.19306930693069307), np.float64(0.693069306930693)]

Scores for axis 1:
[np.float64(0.13861386138613863), np.float64(0.5346534653465347), np.float64(0.8960396039603961)]

Scores for axis 2:
[np.float64(0.3168316831683168), np.float64(0.8168316831683167)]

Scores for axis 3:
[np.float64(0.024752475247524754), np.float64(0.1188118811881188), np.float64(0.27722772277227725), np.float64(0.4653465346534653), np.float64(0.7029702970297029), np.float64(0.9207920792079207)]

Variance of scores for axis 0: 0.059258896186648376

Variance of scores for axis 1: 0.0694360191827437

Variance of scores for axis 2: 0.0580335261248897

Variance of scores for axis 3: 0.07987568681385342
