In [188]:
from itertools import combinations

import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency

# Load the two cohort tables from local pickle files (SP.pkl first, then MP.pkl).
# NOTE(review): unpickling can execute arbitrary code -- only load files from a
# trusted source.
spdata, mpdata = map(pd.read_pickle, ('SP.pkl', 'MP.pkl'))

In [189]:
# List the column labels available in spdata.
spdata.columns

Index(['Patient ID', 'Race recode (W, B, AI, API)',
       'Race recode (White, Black, Other)', 'Year of diagnosis',
       'Site recode ICD-O-3/WHO 2008', 'Primary Site - labeled',
       'Histologic Type ICD-O-3', 'Grade', 'Laterality',
       'ICD-O-3 Hist/behav, malignant',
       'Derived AJCC Stage Group, 7th ed (2010-2015)',
       'Derived AJCC T, 7th ed (2010-2015)',
       'Derived AJCC N, 7th ed (2010-2015)',
       'Derived AJCC M, 7th ed (2010-2015)',
       'Derived SEER Cmb Stg Grp (2016+)', 'Derived SEER Combined T (2016+)',
       'Derived SEER Combined N (2016+)', 'Derived SEER Combined M (2016+)',
       'RX Summ--Surg Prim Site (1998+)', 'Radiation sequence with surgery',
       'Radiation recode', 'Chemotherapy recode (yes, no/unk)',
       'SEER Combined Mets at DX-bone (2010+)',
       'SEER Combined Mets at DX-brain (2010+)',
       'SEER Combined Mets at DX-liver (2010+)',
       'SEER Combined Mets at DX-lung (2010+)', 'Breast Subtype (2010+)',
       'ER Stat

In [190]:
# List the column labels available in mpdata.
mpdata.columns

Index(['Patient ID', 'Record number recode',
       'Site recode B ICD-O-3/WHO 2008 (Event Variable)', 'Year of diagnosis',
       'Race recode (W, B, AI, API)', 'Site recode ICD-O-3/WHO 2008',
       'Age recode with <1 year olds', 'Age at diagnosis',
       'Primary Site - labeled', 'Histologic Type ICD-O-3', 'Grade',
       'Laterality', 'Derived AJCC Stage Group, 7th ed (2010-2015)',
       'Derived AJCC T, 7th ed (2010-2015)',
       'Derived AJCC N, 7th ed (2010-2015)',
       'Derived AJCC M, 7th ed (2010-2015)',
       'Derived SEER Cmb Stg Grp (2016+)', 'Derived SEER Combined T (2016+)',
       'Derived SEER Combined N (2016+)', 'Derived SEER Combined M (2016+)',
       'RX Summ--Surg Prim Site (1998+)', 'Radiation sequence with surgery',
       'Radiation recode', 'Chemotherapy recode (yes, no/unk)',
       'Tumor Size Summary (2016+)', 'CS tumor size (2004-2015)',
       'SEER Combined Mets at DX-bone (2010+)',
       'SEER Combined Mets at DX-brain (2010+)',
       'SEER Co

#### 2.3.7 'Primary Site - labeled'

In [191]:
# Distinct primary-site labels present in spdata, in order of first appearance.
spdata['Primary Site - labeled'].unique()

array(['C50.8-Overlapping lesion of breast',
       'C50.1-Central portion of breast',
       'C50.2-Upper-inner quadrant of breast',
       'C50.3-Lower-inner quadrant of breast',
       'C50.4-Upper-outer quadrant of breast', 'C50.9-Breast, NOS',
       'C50.5-Lower-outer quadrant of breast', 'C50.0-Nipple',
       'C50.6-Axillary tail of breast'], dtype=object)

In [192]:
# Distinct primary-site labels among mpdata rows with 'Record number recode' == 1
# (presumably each patient's first tumour record -- confirm against the data dictionary).
mpdata.loc[mpdata['Record number recode'] == 1, 'Primary Site - labeled'].unique()

array(['C50.9-Breast, NOS', 'C50.4-Upper-outer quadrant of breast',
       'C50.2-Upper-inner quadrant of breast',
       'C50.1-Central portion of breast',
       'C50.8-Overlapping lesion of breast',
       'C50.3-Lower-inner quadrant of breast',
       'C50.5-Lower-outer quadrant of breast',
       'C50.6-Axillary tail of breast', 'C50.0-Nipple'], dtype=object)

In [193]:
# Number of spdata rows per primary-site label (index sorted by label).
(
    spdata
    .groupby('Primary Site - labeled')
    .size()
)

Primary Site - labeled
C50.0-Nipple                              1323
C50.1-Central portion of breast          14344
C50.2-Upper-inner quadrant of breast     38222
C50.3-Lower-inner quadrant of breast     17149
C50.4-Upper-outer quadrant of breast    107245
C50.5-Lower-outer quadrant of breast     23363
C50.6-Axillary tail of breast             1528
C50.8-Overlapping lesion of breast       71496
C50.9-Breast, NOS                        40477
dtype: int64

In [194]:
# Number of mpdata rows per primary-site label, restricted to rows with
# 'Record number recode' == 1.
(
    mpdata
    .loc[mpdata['Record number recode'] == 1]
    .groupby('Primary Site - labeled')
    .size()
)

Primary Site - labeled
C50.0-Nipple                              52
C50.1-Central portion of breast          427
C50.2-Upper-inner quadrant of breast    1010
C50.3-Lower-inner quadrant of breast     471
C50.4-Upper-outer quadrant of breast    2733
C50.5-Lower-outer quadrant of breast     541
C50.6-Axillary tail of breast             38
C50.8-Overlapping lesion of breast      1959
C50.9-Breast, NOS                       1057
dtype: int64

---

**解析**

1. 从频数上看，两组的肿瘤位置分布未见明显区别；四个象限的频数顺序均为 外上 > 内上 > 外下 > 内下

---

#### 2.3.8 'Histologic Type ICD-O-3'

In [195]:
# Distinct ICD-O-3 histologic type codes present in spdata, in order of first appearance.
spdata['Histologic Type ICD-O-3'].unique()

array([8522, 8520, 8500, 9120, 8140, 8480, 8211, 8523, 8543, 8246, 8502,
       8490, 8050, 8524, 8510, 8575, 8010, 8530, 8504, 8507, 8201, 9020,
       8503, 8260, 8200, 8343, 8401, 8541, 8013, 8800, 8022, 8481, 8070,
       8230, 8560, 8249, 8046, 8521, 8123, 8890, 8540, 8501, 8980, 8323,
       8453, 8000, 8574, 8570, 8344, 8550, 8255, 8830, 8982, 8035, 8074,
       8033, 8525, 8513, 8572, 8147, 8935, 8041, 8310, 8032, 9180, 8141,
       8315, 8801, 8045, 8571, 8071, 8802, 8082, 8805, 8512, 8983, 8811,
       8430, 8004, 9041, 8240, 8470, 8072, 8020, 8001, 8410, 9040, 8021,
       8851, 8005, 8251, 8858, 8850, 9260, 8852, 8832, 9580, 8900, 8854,
       8130, 8090, 8042, 8012, 8573, 8290, 8243, 8471, 8815, 8562, 8804,
       8030, 9183, 8912, 8341, 8460, 8542, 8990, 8052, 8810, 8440, 8720,
       8920], dtype=int64)

In [196]:
# Distinct ICD-O-3 histologic type codes among mpdata rows with
# 'Record number recode' == 1.
mpdata.loc[mpdata['Record number recode'] == 1, 'Histologic Type ICD-O-3'].unique()

array([8500, 8520, 8480, 8504, 8507, 8575, 8523, 8522, 8046, 8211, 8530,
       8010, 8260, 8543, 9020, 8521, 8510, 8050, 8140, 8524, 8255, 8201,
       8503, 8200, 8540, 8560, 8401, 8070, 8000, 8032, 8501, 8541, 8502,
       8246, 9181, 8574, 8230, 8041, 8490, 8310, 8800, 8832, 9180, 8343,
       8071, 8980, 9120, 8550, 8801, 8802, 8033, 8481, 8141, 8470, 8940,
       8035], dtype=int64)

In [197]:
# Ten most frequent histologic type codes in spdata, largest count first.
(
    spdata
    .groupby('Histologic Type ICD-O-3')
    .size()
    .sort_values(ascending=False)
    .head(10)
)

Histologic Type ICD-O-3
8500    236433
8520     27572
8522     16542
8523      9998
8480      5725
8010      2747
8140      1830
8211      1574
8575      1382
8507      1262
dtype: int64

In [198]:
# Ten most frequent histologic type codes among mpdata rows with
# 'Record number recode' == 1, largest count first.
(
    mpdata
    .loc[mpdata['Record number recode'] == 1]
    .groupby('Histologic Type ICD-O-3')
    .size()
    .sort_values(ascending=False)
    .head(10)
)

Histologic Type ICD-O-3
8500    5998
8520     764
8522     470
8523     291
8480     207
8010      65
8211      52
8575      43
8140      37
8503      36
dtype: int64

---

**解析**

1. 组织学类型前5名均为 8500 -> 8520 -> 8522 -> 8523 -> 8480

---

#### 2.3.9 'Grade'

In [199]:
# Distinct grade labels present in spdata, in order of first appearance.
spdata['Grade'].unique()

array(['Moderately differentiated; Grade II',
       'Poorly differentiated; Grade III', 'Well differentiated; Grade I',
       'Unknown', 'Undifferentiated; anaplastic; Grade IV'], dtype=object)

In [200]:
# Distinct grade labels among mpdata rows with 'Record number recode' == 1.
mpdata.loc[mpdata['Record number recode'] == 1, 'Grade'].unique()

array(['Poorly differentiated; Grade III',
       'Moderately differentiated; Grade II',
       'Well differentiated; Grade I', 'Unknown',
       'Undifferentiated; anaplastic; Grade IV'], dtype=object)

In [201]:
# Number of spdata rows per grade, largest count first.
(
    spdata
    .groupby('Grade')
    .size()
    .sort_values(ascending=False)
)

Grade
Moderately differentiated; Grade II       130554
Poorly differentiated; Grade III           98754
Well differentiated; Grade I               67025
Unknown                                    17801
Undifferentiated; anaplastic; Grade IV      1013
dtype: int64

In [202]:
# Number of mpdata rows per grade (rows with 'Record number recode' == 1 only),
# largest count first.
(
    mpdata
    .loc[mpdata['Record number recode'] == 1]
    .groupby('Grade')
    .size()
    .sort_values(ascending=False)
)

Grade
Moderately differentiated; Grade II       3421
Poorly differentiated; Grade III          2467
Well differentiated; Grade I              1911
Unknown                                    459
Undifferentiated; anaplastic; Grade IV      30
dtype: int64

In [203]:
# Per-grade row counts for the two cohorts, used to build a contingency table.
#
# sgrade fixes the row order (spdata counts, largest first). mgrade is then
# aligned to that same grade order by label instead of being sorted by its own
# counts: the two Series are later stacked positionally, so independent sorting
# would pair counts from different grades whenever the cohorts rank the grades
# differently.
sgrade = spdata.groupby('Grade').size().sort_values(ascending=False)
mgrade = (
    mpdata.loc[mpdata['Record number recode'] == 1]
    .groupby('Grade')
    .size()
    # A grade absent from mpdata gets a count of 0 rather than NaN.
    # NOTE(review): a grade present only in mpdata would be dropped here.
    .reindex(sgrade.index, fill_value=0)
)

In [204]:
# Contingency table of observed counts: one row per grade, column 0 = spdata,
# column 1 = mpdata. The stacking is purely positional, so sgrade and mgrade
# must list the grades in the same order.
obs = np.array([sgrade, mgrade]).transpose()
print(obs)
# Chi-square test of independence on the full table; the displayed result holds
# the statistic, the p-value, the degrees of freedom and the expected counts.
chi2_contingency(obs)

[[130554   3421]
 [ 98754   2467]
 [ 67025   1911]
 [ 17801    459]
 [  1013     30]]


(19.123676217914262,
 0.0007431662505579445,
 4,
 array([[1.30541900e+05, 3.43310031e+03],
        [9.86272187e+04, 2.59378128e+03],
        [6.71695197e+04, 1.76648034e+03],
        [1.77920887e+04, 4.67911265e+02],
        [1.01627320e+03, 2.67268045e+01]]))

In [205]:
# Pairwise chi-square tests between every pair of grade rows of `obs`.
#
# Short print labels are looked up from the grade names in sgrade.index (the row
# order `obs` was built from) instead of being hard-coded in one particular
# order, so a change in the count ranking cannot silently mislabel the pairs.
GRADE_TAGS = {
    'Moderately differentiated; Grade II': 'II',
    'Poorly differentiated; Grade III': 'III',
    'Well differentiated; Grade I': 'I',
    'Unknown': 'UN',
    'Undifferentiated; anaplastic; Grade IV': 'IV',
}
# Unknown grade names fall back to their full label rather than raising.
tag = [GRADE_TAGS.get(grade, grade) for grade in sgrade.index]
assert len(tag) == len(obs), 'obs rows and grade labels are out of sync'

# combinations() yields (0, 1), (0, 2), ..., i.e. each unordered pair once, in
# the same order as a nested "row2 > row" loop.
for row, row2 in combinations(range(len(obs)), 2):
    # 2x2 sub-table: the two selected grades x (spdata, mpdata).
    test = np.array([obs[row], obs[row2]])
    # Index 1 of the result is the p-value. For a 2x2 table chi2_contingency
    # applies Yates' continuity correction by default.
    # NOTE(review): these p-values are not adjusted for multiple comparisons,
    # and rounding to 3 decimals prints very small p-values as 0.0.
    print(tag[row], tag[row2], round(chi2_contingency(test)[1], 3))

II III 0.076
II I 0.004
II UN 0.768
II IV 0.576
III I 0.0
III UN 0.556
III IV 0.416
I UN 0.06
I IV 0.914
UN IV 0.533
