In [1]:
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency

spdata = pd.read_pickle('SP.pkl')
mpdata = pd.read_pickle('MP.pkl')

In [2]:
spdata.columns

Index(['Patient ID', 'Race recode (W, B, AI, API)',
       'Race recode (White, Black, Other)', 'Year of diagnosis',
       'Site recode ICD-O-3/WHO 2008', 'Primary Site - labeled',
       'Histologic Type ICD-O-3', 'Grade', 'Laterality',
       'ICD-O-3 Hist/behav, malignant',
       'Derived AJCC Stage Group, 7th ed (2010-2015)',
       'Derived AJCC T, 7th ed (2010-2015)',
       'Derived AJCC N, 7th ed (2010-2015)',
       'Derived AJCC M, 7th ed (2010-2015)',
       'Derived SEER Cmb Stg Grp (2016+)', 'Derived SEER Combined T (2016+)',
       'Derived SEER Combined N (2016+)', 'Derived SEER Combined M (2016+)',
       'RX Summ--Surg Prim Site (1998+)', 'Radiation sequence with surgery',
       'Radiation recode', 'Chemotherapy recode (yes, no/unk)',
       'SEER Combined Mets at DX-bone (2010+)',
       'SEER Combined Mets at DX-brain (2010+)',
       'SEER Combined Mets at DX-liver (2010+)',
       'SEER Combined Mets at DX-lung (2010+)', 'Breast Subtype (2010+)',
       'ER Stat

In [3]:
mpdata.columns

Index(['Patient ID', 'Record number recode',
       'Site recode B ICD-O-3/WHO 2008 (Event Variable)', 'Year of diagnosis',
       'Race recode (W, B, AI, API)', 'Site recode ICD-O-3/WHO 2008',
       'Age recode with <1 year olds', 'Age at diagnosis',
       'Primary Site - labeled', 'Histologic Type ICD-O-3', 'Grade',
       'Laterality', 'Derived AJCC Stage Group, 7th ed (2010-2015)',
       'Derived AJCC T, 7th ed (2010-2015)',
       'Derived AJCC N, 7th ed (2010-2015)',
       'Derived AJCC M, 7th ed (2010-2015)',
       'Derived SEER Cmb Stg Grp (2016+)', 'Derived SEER Combined T (2016+)',
       'Derived SEER Combined N (2016+)', 'Derived SEER Combined M (2016+)',
       'RX Summ--Surg Prim Site (1998+)', 'Radiation sequence with surgery',
       'Radiation recode', 'Chemotherapy recode (yes, no/unk)',
       'Tumor Size Summary (2016+)', 'CS tumor size (2004-2015)',
       'SEER Combined Mets at DX-bone (2010+)',
       'SEER Combined Mets at DX-brain (2010+)',
       'SEER Co

In [4]:
def highlight_diff(rowdata):
    return ['background-color: darkgreen' if rowdata['p'] < 0.05 else '' for v in rowdata]

In [19]:
def columninfo(tag, spdata, mpdata):

    print('='*90)

    # 分项统计
    svalue = spdata.groupby(tag).size().sort_values(ascending=False)
    print('spdata:', svalue)
    print('-'*90)    
    mvalue = mpdata[mpdata['Record number recode']==1].groupby(tag).size().sort_values(ascending=False)
    print('mpdata:', mvalue)
    print('-'*90)

    # 项目列表
    snunique = spdata[tag].nunique()
    mnunique = mpdata[mpdata['Record number recode']==1][tag].nunique()
    print(f'spdata: {snunique}, mpdata: {mnunique}')
    sunique = spdata[tag].unique()
    munique = mpdata[mpdata['Record number recode']==1][tag].unique()
    index = list(set(sunique).intersection(set(munique)))
    diff = list(set(sunique).difference(set(munique)))
    print('difference: ', diff)
    print('*'*90)

    # 同项目卡方检验，排序时用'0'，排序后换回0
    obs = np.array([svalue[index], mvalue[index]]).T

    obsframe = pd.DataFrame(index=index, columns=['svalue','mvalue','percent'])
    obsframe['svalue'] = obs[:,0]
    obsframe['mvalue'] = obs[:,1]
    obsframe['percent'] = round(obsframe['mvalue']/(obsframe['svalue']+obsframe['mvalue']), 3)
    print(obsframe.sort_values(by='percent', ascending=False))
    
    # 整体差异性
    print('p = ', round(chi2_contingency(obs)[1], 3))
    print('-'*90)

    # 组间两两比较
    result = pd.DataFrame()
    result_type1 = []
    result_percent1 = []
    result_type2 = []
    result_percent2 = []
    result_p = []
    for type1 in range(len(obs)):
        for type2 in range(type1+1, len(obs)):
            result_type1.append(index[type1])
            result_percent1.append(obsframe.loc[index[type1], 'percent'])
            result_type2.append(index[type2])
            result_percent2.append(obsframe.loc[index[type2], 'percent'])
            result_p.append(round(chi2_contingency(np.array([obs[type1], obs[type2]]))[1],3))
    result['type1'] = result_type1
    result['percent1'] = result_percent1
    result['type2'] = result_type2
    result['percent2'] = result_percent2
    result['p'] = result_p

    return result.style.apply(highlight_diff, axis=1).format({'p': '{:.3f}', 'percent1': '{:.3f}', 'percent2': '{:.3f}'})

#### 2.3.7 'Primary Site - labeled'

In [20]:
columninfo('Primary Site - labeled', spdata, mpdata)

spdata: Primary Site - labeled
C50.4-Upper-outer quadrant of breast    107245
C50.8-Overlapping lesion of breast       71496
C50.9-Breast, NOS                        40477
C50.2-Upper-inner quadrant of breast     38222
C50.5-Lower-outer quadrant of breast     23363
C50.3-Lower-inner quadrant of breast     17149
C50.1-Central portion of breast          14344
C50.6-Axillary tail of breast             1528
C50.0-Nipple                              1323
dtype: int64
------------------------------------------------------------------------------------------
mpdata: Primary Site - labeled
C50.4-Upper-outer quadrant of breast    2733
C50.8-Overlapping lesion of breast      1959
C50.9-Breast, NOS                       1057
C50.2-Upper-inner quadrant of breast    1010
C50.5-Lower-outer quadrant of breast     541
C50.3-Lower-inner quadrant of breast     471
C50.1-Central portion of breast          427
C50.0-Nipple                              52
C50.6-Axillary tail of breast             38
dtype:

Unnamed: 0,type1,percent1,type2,percent2,p
0,C50.0-Nipple,0.038,C50.5-Lower-outer quadrant of breast,0.023,0.0
1,C50.0-Nipple,0.038,C50.8-Overlapping lesion of breast,0.027,0.014
2,C50.0-Nipple,0.038,C50.6-Axillary tail of breast,0.024,0.043
3,C50.0-Nipple,0.038,C50.2-Upper-inner quadrant of breast,0.026,0.008
4,C50.0-Nipple,0.038,C50.4-Upper-outer quadrant of breast,0.025,0.003
5,C50.0-Nipple,0.038,C50.1-Central portion of breast,0.029,0.075
6,C50.0-Nipple,0.038,"C50.9-Breast, NOS",0.025,0.006
7,C50.0-Nipple,0.038,C50.3-Lower-inner quadrant of breast,0.027,0.02
8,C50.5-Lower-outer quadrant of breast,0.023,C50.8-Overlapping lesion of breast,0.027,0.001
9,C50.5-Lower-outer quadrant of breast,0.023,C50.6-Axillary tail of breast,0.024,0.739


---

**解析**

1. 位置上未见区别，顺序为 外上 -> 内上 -> 内下/外下

---

#### 2.3.8 'Histologic Type ICD-O-3'

In [21]:
columninfo('Histologic Type ICD-O-3', spdata, mpdata)

spdata: Histologic Type ICD-O-3
8500    236433
8520     27572
8522     16542
8523      9998
8480      5725
         ...  
8990         1
8344         1
8243         1
9040         1
8410         1
Length: 122, dtype: int64
------------------------------------------------------------------------------------------
mpdata: Histologic Type ICD-O-3
8500    5998
8520     764
8522     470
8523     291
8480     207
8010      65
8211      52
8575      43
8140      37
8503      36
8530      33
8507      32
8524      25
9020      24
8050      21
8510      21
8401      19
8541      16
8201      13
8200      12
8543      10
8504       9
8521       8
8000       8
8230       7
8501       7
8540       7
8260       6
8560       4
8032       4
8070       3
8255       3
8343       3
8033       3
9120       3
8046       2
8246       2
8502       2
8041       1
8141       1
8980       1
8035       1
8071       1
8940       1
8801       1
8832       1
8802       1
8800       1
8310       1
8574       1
8470

Unnamed: 0,type1,percent1,type2,percent2,p
0,8832,0.167,8070,0.029,0.532
1,8832,0.167,8071,0.037,0.796
2,8832,0.167,8200,0.048,0.711
3,8832,0.167,8201,0.023,0.342
4,8832,0.167,8211,0.032,0.481
5,8832,0.167,8980,0.014,0.352
6,8832,0.167,8470,0.5,1.0
7,8832,0.167,8343,0.044,0.741
8,8832,0.167,8480,0.035,0.52
9,8832,0.167,9120,0.048,0.789


---

**解析**

1. 组织学类型前5名均为 8500 -> 8520 -> 8522 -> 8523 -> 8480

---

#### 2.3.9 'Grade'

In [22]:
columninfo('Grade', spdata, mpdata)

spdata: Grade
Moderately differentiated; Grade II       130554
Poorly differentiated; Grade III           98754
Well differentiated; Grade I               67025
Unknown                                    17801
Undifferentiated; anaplastic; Grade IV      1013
dtype: int64
------------------------------------------------------------------------------------------
mpdata: Grade
Moderately differentiated; Grade II       3421
Poorly differentiated; Grade III          2467
Well differentiated; Grade I              1911
Unknown                                    459
Undifferentiated; anaplastic; Grade IV      30
dtype: int64
------------------------------------------------------------------------------------------
spdata: 5, mpdata: 5
difference:  []
******************************************************************************************
                                        svalue  mvalue  percent
Undifferentiated; anaplastic; Grade IV    1013      30    0.029
Well differentiated; Grade I

Unnamed: 0,type1,percent1,type2,percent2,p
0,Undifferentiated; anaplastic; Grade IV,0.029,Moderately differentiated; Grade II,0.026,0.576
1,Undifferentiated; anaplastic; Grade IV,0.029,Poorly differentiated; Grade III,0.024,0.416
2,Undifferentiated; anaplastic; Grade IV,0.029,Well differentiated; Grade I,0.028,0.914
3,Undifferentiated; anaplastic; Grade IV,0.029,Unknown,0.025,0.533
4,Moderately differentiated; Grade II,0.026,Poorly differentiated; Grade III,0.024,0.076
5,Moderately differentiated; Grade II,0.026,Well differentiated; Grade I,0.028,0.004
6,Moderately differentiated; Grade II,0.026,Unknown,0.025,0.768
7,Poorly differentiated; Grade III,0.024,Well differentiated; Grade I,0.028,0.0
8,Poorly differentiated; Grade III,0.024,Unknown,0.025,0.556
9,Well differentiated; Grade I,0.028,Unknown,0.025,0.06


---

**解析**

1. Grade I 与 Grade II/III 在单原发与多原发癌中存在统计学差异性，其出现多原发癌的可能性略高些

2. 原因为 XXX

---

#### 2.3.10 'Laterality'

In [23]:
columninfo('Laterality', spdata, mpdata)

spdata: Laterality
Left - origin of primary                                 159512
Right - origin of primary                                154258
Paired site, but no information concerning laterality      1103
Only one side - side unspecified                            144
Bilateral, single primary                                   130
dtype: int64
------------------------------------------------------------------------------------------
mpdata: Laterality
Left - origin of primary                                 4225
Right - origin of primary                                4052
Paired site, but no information concerning laterality      11
dtype: int64
------------------------------------------------------------------------------------------
spdata: 5, mpdata: 3
difference:  ['Only one side - side unspecified', 'Bilateral, single primary']
******************************************************************************************
                                                    svalu

Unnamed: 0,type1,percent1,type2,percent2,p
0,"Paired site, but no information concerning laterality",0.01,Right - origin of primary,0.026,0.001
1,"Paired site, but no information concerning laterality",0.01,Left - origin of primary,0.026,0.001
2,Right - origin of primary,0.026,Left - origin of primary,0.026,0.717


---

**解析**

1. 双原发癌初始双侧癌仅11例，无意义

2. 初始左右侧癌无统计学意义

---

#### 2.3.11 Derived AJCC Stage Group, 7th ed (2010-2015)

In [27]:
columninfo('Derived AJCC Stage Group, 7th ed (2010-2015)', spdata, mpdata)

spdata: Derived AJCC Stage Group, 7th ed (2010-2015)
IA           121617
IIA           56615
Blank(s)      49252
IIB           30092
IIIA          16809
IV            14511
UNK Stage      7265
IIIC           6747
IB             5860
IIIB           5540
IIINOS          351
0               307
dtype: int64
------------------------------------------------------------------------------------------
mpdata: Derived AJCC Stage Group, 7th ed (2010-2015)
IA           4029
IIA          1787
IIB           812
IIIA          466
IV            274
IIIC          265
UNK Stage     191
IIIB          190
IB            176
Blank(s)       59
0              17
IIINOS         13
dtype: int64
------------------------------------------------------------------------------------------
spdata: 12, mpdata: 12
difference:  [nan]
******************************************************************************************
           svalue  mvalue  percent
0             307      17    0.052
IIIC         6747     265  

Unnamed: 0,type1,percent1,type2,percent2,p
0,0,0.052,IA,0.032,0.055
1,0,0.052,UNK Stage,0.026,0.006
2,0,0.052,IIINOS,0.036,0.375
3,0,0.052,IIIA,0.027,0.009
4,0,0.052,IIIB,0.033,0.088
5,0,0.052,IIB,0.026,0.006
6,0,0.052,Blank(s),0.001,0.0
7,0,0.052,IV,0.019,0.0
8,0,0.052,IIIC,0.038,0.232
9,0,0.052,IIA,0.031,0.035


---

**解析**

1. IIIB IIIC 期患者更容易出现第二原发癌

2. IV 几率不高，可能死亡率高

3. IA 期0.032，可能存活期较长

---

#### 2.3.12 Derived AJCC T, 7th ed (2010-2015)

In [28]:
columninfo('Derived AJCC T, 7th ed (2010-2015)', spdata, mpdata)

spdata: Derived AJCC T, 7th ed (2010-2015)
T1c         84596
T2          79164
Blank(s)    49252
T1b         42215
T3          16785
T1a         16748
TX           8621
T4b          6031
T1mic        4855
T4d          3286
T4a          1352
T0            669
T1NOS         406
T4c           343
T4NOS         336
Tis           307
dtype: int64
------------------------------------------------------------------------------------------
mpdata: Derived AJCC T, 7th ed (2010-2015)
T1c         2733
T2          2345
T1b         1361
T1a          564
T3           498
TX           199
T4b          171
T1mic        147
T4d          105
Blank(s)      59
T4a           29
Tis           17
T1NOS         17
T0            17
T4NOS          9
T4c            8
dtype: int64
------------------------------------------------------------------------------------------
spdata: 16, mpdata: 16
difference:  [nan]
******************************************************************************************
          sva

Unnamed: 0,type1,percent1,type2,percent2,p
0,TX,0.023,T0,0.025,0.808
1,TX,0.023,T4c,0.023,0.877
2,TX,0.023,T3,0.029,0.003
3,TX,0.023,T1NOS,0.04,0.029
4,TX,0.023,T4b,0.028,0.058
5,TX,0.023,T1c,0.031,0.0
6,TX,0.023,Blank(s),0.001,0.0
7,TX,0.023,T1b,0.031,0.0
8,TX,0.023,T1mic,0.029,0.016
9,TX,0.023,T4NOS,0.026,0.805


### 2.3.13 Derived AJCC N, 7th ed (2010-2015)

In [29]:
columninfo('Derived AJCC N, 7th ed (2010-2015)', spdata, mpdata)

spdata: Derived AJCC N, 7th ed (2010-2015)
N0          104464
N0(i-)       63943
Blank(s)     49252
N1a          32934
N1           16330
N2a          12090
N1mi         11091
N3a           5840
N0(i+)        5587
NX            5391
N1NOS         2565
N2NOS         1625
N3c           1278
N3b           1254
N3NOS          443
N0(mol-)       386
N2b            253
N1c            124
N1b             76
N0(mol+)        40
dtype: int64
------------------------------------------------------------------------------------------
mpdata: Derived AJCC N, 7th ed (2010-2015)
N0          3412
N0(i-)      2096
N1a          944
N2a          341
N1mi         334
N1           320
N3a          232
N0(i+)       188
NX           120
N1NOS         72
Blank(s)      59
N3c           43
N2NOS         42
N3b           31
N0(mol-)      17
N3NOS         15
N2b            6
N1c            6
N0(mol+)       1
dtype: int64
------------------------------------------------------------------------------------------
spd

Unnamed: 0,type1,percent1,type2,percent2,p
0,N1a,0.028,N2b,0.023,0.788
1,N1a,0.028,N1mi,0.029,0.464
2,N1a,0.028,N3b,0.024,0.475
3,N1a,0.028,N1,0.019,0.0
4,N1a,0.028,N0(mol+),0.024,0.734
5,N1a,0.028,N0(mol-),0.042,0.114
6,N1a,0.028,NX,0.022,0.011
7,N1a,0.028,N1NOS,0.027,0.915
8,N1a,0.028,N0,0.032,0.0
9,N1a,0.028,N0(i-),0.032,0.001


#### 2.3.14 Derived AJCC M, 7th ed (2010-2015)

In [30]:
columninfo('Derived AJCC M, 7th ed (2010-2015)', spdata, mpdata)

spdata: Derived AJCC M, 7th ed (2010-2015)
M0          251164
Blank(s)     49252
M1           14511
M0(i+)          39
dtype: int64
------------------------------------------------------------------------------------------
mpdata: Derived AJCC M, 7th ed (2010-2015)
M0          7946
M1           274
Blank(s)      59
dtype: int64
------------------------------------------------------------------------------------------
spdata: 4, mpdata: 3
difference:  [nan, 'M0(i+)']
******************************************************************************************
          svalue  mvalue  percent
M0        251164    7946    0.031
M1         14511     274    0.019
Blank(s)   49252      59    0.001
p =  0.0
------------------------------------------------------------------------------------------


Unnamed: 0,type1,percent1,type2,percent2,p
0,M1,0.019,Blank(s),0.001,0.0
1,M1,0.019,M0,0.031,0.0
2,Blank(s),0.001,M0,0.031,0.0


#### 2.3.15 Derived SEER Cmb Stg Grp (2016+)

In [31]:
columninfo('Derived SEER Cmb Stg Grp (2016+)', spdata, mpdata)

spdata: Derived SEER Cmb Stg Grp (2016+)
Blank(s)          265895
1A                 22985
2A                 10993
2B                  5404
3A                  2836
4                   2688
3B                  1165
99                  1120
3C                  1039
1B                   860
0                     82
3                     51
Not applicable        29
dtype: int64
------------------------------------------------------------------------------------------
mpdata: Derived SEER Cmb Stg Grp (2016+)
Blank(s)    8229
1A            22
2A            18
2B             6
4              5
1B             3
3A             2
3C             1
3B             1
99             1
dtype: int64
------------------------------------------------------------------------------------------
spdata: 13, mpdata: 10
difference:  [0, 'Not applicable', 3]
******************************************************************************************
          svalue  mvalue  percent
Blank(s)  265895    8229    0

Unnamed: 0,type1,percent1,type2,percent2,p
0,99,0.001,4,0.002,0.813
1,99,0.001,3A,0.001,0.654
2,99,0.001,3B,0.001,0.497
3,99,0.001,Blank(s),0.03,0.0
4,99,0.001,2B,0.001,0.765
5,99,0.001,3C,0.001,0.513
6,99,0.001,1B,0.003,0.443
7,99,0.001,2A,0.002,0.839
8,99,0.001,1A,0.001,0.669
9,4,0.002,3A,0.001,0.409


#### 2.3.16 Derived SEER Combined T (2016+) N M

In [32]:
columninfo('Derived SEER Combined T (2016+)', spdata, mpdata)

spdata: Derived SEER Combined T (2016+)
Blank(s)          266264
p1C                13638
p2                  9852
p1B                 7375
c2                  5017
p1A                 3199
c3                  1677
c1C                 1614
p3                  1341
cX                   859
p1MI                 785
c4B                  696
c4D                  551
c1B                  470
c4                   412
p4B                  238
p1                   201
c4A                  157
c1                   157
c1A                  129
pIS                   96
c0                    87
c4C                   80
p4                    62
p4D                   50
p4A                   38
Not applicable        29
c1MI                  28
pX                    26
p4C                   14
p0                     5
dtype: int64
------------------------------------------------------------------------------------------
mpdata: Derived SEER Combined T (2016+)
Blank(s)    8230
p1C           17
p2     

Unnamed: 0,type1,percent1,type2,percent2,p
0,p4B,0.004,p1C,0.001,0.73
1,p4B,0.004,c1C,0.001,0.845
2,p4B,0.004,p1B,0.001,0.735
3,p4B,0.004,p2,0.001,0.805
4,p4B,0.004,c4D,0.002,0.872
5,p4B,0.004,p1MI,0.001,0.955
6,p4B,0.004,c3,0.001,0.591
7,p4B,0.004,Blank(s),0.03,0.032
8,p4B,0.004,c2,0.002,0.884
9,p4B,0.004,p1A,0.001,0.662


In [33]:
columninfo('Derived SEER Combined N (2016+)', spdata, mpdata)

spdata: Derived SEER Combined N (2016+)
Blank(s)          266264
p0                 16555
p0I-               10179
p1A                 5517
c0                  5437
c1                  2856
p2A                 1629
p1MI                1624
p0I+                 922
p1                   882
cX                   721
p3A                  683
c2                   336
c3C                  232
p2                   203
c3                   180
c3B                  175
c2A                  173
p1C                  126
pX                    88
p3                    74
p0M-                  58
c3A                   54
c2B                   44
p1B                   34
p3B                   29
Not applicable        29
p3C                   16
p2B                   16
p0M+                  11
dtype: int64
------------------------------------------------------------------------------------------
mpdata: Derived SEER Combined N (2016+)
Blank(s)    8230
p0            20
p1A           10
p0I-           

Unnamed: 0,type1,percent1,type2,percent2,p
0,p2A,0.001,c3B,0.006,0.467
1,p2A,0.001,p1MI,0.003,0.22
2,p2A,0.001,c3,0.006,0.479
3,p2A,0.001,Blank(s),0.03,0.0
4,p2A,0.001,c0,0.001,0.91
5,p2A,0.001,c2,0.006,0.131
6,p2A,0.001,p0I-,0.001,0.803
7,p2A,0.001,c1,0.001,0.961
8,p2A,0.001,cX,0.001,0.861
9,p2A,0.001,p3A,0.001,0.888


In [34]:
columninfo('Derived SEER Combined M (2016+)', spdata, mpdata)

spdata: Derived SEER Combined M (2016+)
Blank(s)          266264
c0                 46158
c1                  1719
p1                   960
Not applicable        29
c0I+                  17
dtype: int64
------------------------------------------------------------------------------------------
mpdata: Derived SEER Combined M (2016+)
Blank(s)    8230
c0            53
c1             5
dtype: int64
------------------------------------------------------------------------------------------
spdata: 6, mpdata: 3
difference:  ['Not applicable', 'p1', 'c0I+']
******************************************************************************************
          svalue  mvalue  percent
Blank(s)  266264    8230    0.030
c1          1719       5    0.003
c0         46158      53    0.001
p =  0.0
------------------------------------------------------------------------------------------


Unnamed: 0,type1,percent1,type2,percent2,p
0,Blank(s),0.03,c1,0.003,0.0
1,Blank(s),0.03,c0,0.001,0.0
2,c1,0.003,c0,0.001,0.089


#### 2.3.17 RX Summ--Surg Prim Site (1998+)

In [35]:
columninfo('RX Summ--Surg Prim Site (1998+)', spdata, mpdata)

spdata: RX Summ--Surg Prim Site (1998+)
22    109609
0      29299
23     28526
41     27566
51     24846
20     17576
24     11892
42      8935
49      6336
52      6057
30      4791
48      4169
45      4139
40      4117
44      3586
47      2830
59      2367
50      2259
43      1806
75      1681
46      1612
55      1540
58      1377
54      1297
21      1254
57       951
99       899
63       680
56       648
53       634
80       596
61       393
90       283
62       168
60        75
64        46
69        37
68        37
65        36
76        35
19        31
72        28
74        25
66        24
73        18
67        15
71        13
70         8
dtype: int64
------------------------------------------------------------------------------------------
mpdata: RX Summ--Surg Prim Site (1998+)
22    3062
51     902
23     901
41     856
20     490
0      456
24     372
42     156
40     121
52     102
45      92
44      83
50      83
49      80
48      66
46      50
43      41
30   

Unnamed: 0,type1,percent1,type2,percent2,p
0,0,0.015,20,0.027,0.0
1,0,0.015,21,0.025,0.01
2,0,0.015,22,0.027,0.0
3,0,0.015,23,0.031,0.0
4,0,0.015,24,0.03,0.0
5,0,0.015,30,0.008,0.0
6,0,0.015,40,0.029,0.0
7,0,0.015,41,0.03,0.0
8,0,0.015,42,0.017,0.237
9,0,0.015,43,0.022,0.027


#### 2.3.18 Radiation sequence with surgery

In [36]:
columninfo('Radiation sequence with surgery', spdata, mpdata)

spdata: Radiation sequence with surgery
Radiation after surgery                                   156735
No radiation and/or cancer-directed surgery               154712
Intraoperative radiation                                    1729
Radiation prior to surgery                                   766
Intraoperative rad with other rad before/after surgery       516
Radiation before and after surgery                           396
Sequence unknown, but both were given                        210
Surgery both before and after radiation                       83
dtype: int64
------------------------------------------------------------------------------------------
mpdata: Radiation sequence with surgery
Radiation after surgery                                   4313
No radiation and/or cancer-directed surgery               3880
Intraoperative radiation                                    42
Radiation prior to surgery                                  24
Intraoperative rad with other rad before/aft

Unnamed: 0,type1,percent1,type2,percent2,p
0,Radiation prior to surgery,0.03,"Sequence unknown, but both were given",0.023,0.746
1,Radiation prior to surgery,0.03,No radiation and/or cancer-directed surgery,0.024,0.338
2,Radiation prior to surgery,0.03,Intraoperative rad with other rad before/after surgery,0.023,0.507
3,Radiation prior to surgery,0.03,Radiation after surgery,0.027,0.607
4,Radiation prior to surgery,0.03,Intraoperative radiation,0.024,0.396
5,Radiation prior to surgery,0.03,Surgery both before and after radiation,0.035,0.921
6,Radiation prior to surgery,0.03,Radiation before and after surgery,0.022,0.53
7,"Sequence unknown, but both were given",0.023,No radiation and/or cancer-directed surgery,0.024,0.915
8,"Sequence unknown, but both were given",0.023,Intraoperative rad with other rad before/after surgery,0.023,0.821
9,"Sequence unknown, but both were given",0.023,Radiation after surgery,0.027,0.914


#### 2.3.19 Radiation recode

In [37]:
columninfo('Radiation recode', spdata, mpdata)

spdata: Radiation recode
Beam radiation                                           154379
None/Unknown                                             135199
Recommended, unknown if administered                      10977
Radioactive implants (includes brachytherapy) (1988+)      7096
Refused (1988+)                                            5461
Radiation, NOS  method or source not specified             1435
Combination of beam with implants or isotopes               469
Radioisotopes (1988+)                                       131
dtype: int64
------------------------------------------------------------------------------------------
mpdata: Radiation recode
Beam radiation                                           4116
None/Unknown                                             3420
Radioactive implants (includes brachytherapy) (1988+)     261
Recommended, unknown if administered                      227
Refused (1988+)                                           184
Radiation, NOS  method o

Unnamed: 0,type1,percent1,type2,percent2,p
0,Radioisotopes (1988+),0.037,"Recommended, unknown if administered",0.02,0.295
1,Radioisotopes (1988+),0.037,Radioactive implants (includes brachytherapy) (1988+),0.035,0.878
2,Radioisotopes (1988+),0.037,Combination of beam with implants or isotopes,0.049,0.722
3,Radioisotopes (1988+),0.037,"Radiation, NOS method or source not specified",0.034,0.924
4,Radioisotopes (1988+),0.037,None/Unknown,0.025,0.527
5,Radioisotopes (1988+),0.037,Refused (1988+),0.033,0.979
6,Radioisotopes (1988+),0.037,Beam radiation,0.026,0.602
7,"Recommended, unknown if administered",0.02,Radioactive implants (includes brachytherapy) (1988+),0.035,0.0
8,"Recommended, unknown if administered",0.02,Combination of beam with implants or isotopes,0.049,0.0
9,"Recommended, unknown if administered",0.02,"Radiation, NOS method or source not specified",0.034,0.001


#### 2.3.20 Chemotherapy recode (yes, no/unk)

In [38]:
columninfo('Chemotherapy recode (yes, no/unk)', spdata, mpdata)

spdata: Chemotherapy recode (yes, no/unk)
No/Unknown    184632
Yes           130515
dtype: int64
------------------------------------------------------------------------------------------
mpdata: Chemotherapy recode (yes, no/unk)
No/Unknown    5349
Yes           2939
dtype: int64
------------------------------------------------------------------------------------------
spdata: 2, mpdata: 2
difference:  []
******************************************************************************************
            svalue  mvalue  percent
No/Unknown  184632    5349    0.028
Yes         130515    2939    0.022
p =  0.0
------------------------------------------------------------------------------------------


Unnamed: 0,type1,percent1,type2,percent2,p
0,Yes,0.022,No/Unknown,0.028,0.0


#### 2.3.21

In [44]:
spdata.columns

Index(['Patient ID', 'Race recode (W, B, AI, API)',
       'Race recode (White, Black, Other)', 'Year of diagnosis',
       'Site recode ICD-O-3/WHO 2008', 'Primary Site - labeled',
       'Histologic Type ICD-O-3', 'Grade', 'Laterality',
       'ICD-O-3 Hist/behav, malignant',
       'Derived AJCC Stage Group, 7th ed (2010-2015)',
       'Derived AJCC T, 7th ed (2010-2015)',
       'Derived AJCC N, 7th ed (2010-2015)',
       'Derived AJCC M, 7th ed (2010-2015)',
       'Derived SEER Cmb Stg Grp (2016+)', 'Derived SEER Combined T (2016+)',
       'Derived SEER Combined N (2016+)', 'Derived SEER Combined M (2016+)',
       'RX Summ--Surg Prim Site (1998+)', 'Radiation sequence with surgery',
       'Radiation recode', 'Chemotherapy recode (yes, no/unk)',
       'SEER Combined Mets at DX-bone (2010+)',
       'SEER Combined Mets at DX-brain (2010+)',
       'SEER Combined Mets at DX-liver (2010+)',
       'SEER Combined Mets at DX-lung (2010+)', 'Breast Subtype (2010+)',
       'ER Stat

In [45]:
mpdata.columns

Index(['Patient ID', 'Record number recode',
       'Site recode B ICD-O-3/WHO 2008 (Event Variable)', 'Year of diagnosis',
       'Race recode (W, B, AI, API)', 'Site recode ICD-O-3/WHO 2008',
       'Age recode with <1 year olds', 'Age at diagnosis',
       'Primary Site - labeled', 'Histologic Type ICD-O-3', 'Grade',
       'Laterality', 'Derived AJCC Stage Group, 7th ed (2010-2015)',
       'Derived AJCC T, 7th ed (2010-2015)',
       'Derived AJCC N, 7th ed (2010-2015)',
       'Derived AJCC M, 7th ed (2010-2015)',
       'Derived SEER Cmb Stg Grp (2016+)', 'Derived SEER Combined T (2016+)',
       'Derived SEER Combined N (2016+)', 'Derived SEER Combined M (2016+)',
       'RX Summ--Surg Prim Site (1998+)', 'Radiation sequence with surgery',
       'Radiation recode', 'Chemotherapy recode (yes, no/unk)',
       'Tumor Size Summary (2016+)', 'CS tumor size (2004-2015)',
       'SEER Combined Mets at DX-bone (2010+)',
       'SEER Combined Mets at DX-brain (2010+)',
       'SEER Co