## Smoking Cessation Study

In [1]:
import numpy as np                                      #standard imports
import scipy as sc
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import savReaderWriter as srw                           #Python package for reading SPSS .sav files
import seaborn as sb

### Path to SPSS .sav file

In [2]:
# Relative path to the SPSS .sav file; both the header reader and the data
# reader in the cells below open this same file.
savFileName = '../SEMIII_Lapse_CW_1.sav'

### Read data dictionary from .sav file and list keys

In [3]:
# Open only the header of the .sav file and pull its metadata into `dd`.
# The context manager closes the file as soon as the dictionary is built.
with srw.SavHeaderReader(savFileName) as sav_header:
    dd = sav_header.dataDictionary()

# Show which metadata sections are available.
dd.keys()

dict_keys(['fileAttributes', 'varTypes', 'varAttributes', 'alignments', 'columnWidths', 'varSets', 'varRoles', 'varNames', 'caseWeightVar', 'fileLabel', 'formats', 'multRespDefs', 'varLabels', 'valueLabels', 'measureLevels', 'missingValues'])

In [4]:
# Display the variable-label mapping from the data dictionary.
# Keys and labels are byte strings (see the b'...' reprs in the output).
dd['varLabels']

{b'Attempts': b'',
 b'CallDate': b'',
 b'CallDayNumber': b'Call Day Number',
 b'CallResult': b'',
 b'CallResultTypeID': b'',
 b'CallType': b'Call Type',
 b'CreationDate': b'',
 b'Note': b'',
 b'Nps1': b'Percentile Group of ps1',
 b'Nps2': b'Percentile Group of ps2',
 b'Nps3': b'Percentile Group of ps3',
 b'Npsall': b'Percentile Group of psall',
 b'Terminated_total': b'% of all calls- terminated',
 b'V100': b'79.some other activity?',
 b'V101': b'80.at home?',
 b'V102': b'81.at work?',
 b'V103': b"82.at someone else's home?",
 b'V104': b'83.a bar or restaurant?',
 b'V105': b'84.in a car?',
 b'V106': b'85.outside?',
 b'V107': b'86.at the hospital?',
 b'V108': b'87.some other location?',
 b'V109': b'89.Do you think you smoked because you were coping with stress or some other negative emotion?',
 b'V110': b'90.Today, I had problems with my health.',
 b'V111': b'91.Today, I felt like I had a serious illness.',
 b'V112': b'92.Today, I was afraid when I thought about my health.',
 b'V113': b'

### Make a list of the column names and initialize a dictionary to hold the data

In [5]:
# Variable names come out of the data dictionary as bytes; decode each one
# to str so it can serve as a column label.
columns = [raw_name.decode() for raw_name in dd['varNames']]

# One empty list per column, filled case by case when the data is read.
datad = {colname: [] for colname in columns}

In [6]:
# Show the keys of the data dictionary (the decoded variable names).
datad.keys()

dict_keys(['V113', 'chartreview_a1_005', 'V43', 'V41', 'V114', 'chartreview_e1_005', 'V82', 'Attempts', 'V32', 'Nps2', 'V62', 'baseline_c1_004', 'V108', 'V81', 'V76', 'baseline_d1_001', 'intervention', 'baseline_b1_004', 'V80', 'Nps3', 'chartreview_e1_004', 'V60', 'V64', 'V89', 'V96', 'V75', 'chartreview_e1_006a', 'V97', 'V72', 'Terminated_total', 'V44', 'V34', 'V116', 'chartreview_a1_001', 'V84', 'V100', 'V49', 'baseline_b1_003_reversecode', 'V87', 'V109', 'baseline_d1_003_reversecode', 'V88', 'V52', 'ps2', 'V93', 'baseline_c1_003_reversecode', 'V103', 'V58', 'V53', 'V95', 'firstcigarettesurvey', 'ps3', 'V83', 'baseline_b1_002', 'CallResultTypeID', 'V57', 'CallResult', 'chartreview_a1_007', 'V74', 'psall', 'firstcigarettereported', 'V24', 'Npsall', 'baseline_c1_002', 'V69', 'V65', 'V42', 'CallDate', 'V78', 'V59', 'V70', 'chartreview_e1_005a', 'V73', 'V30', 'V40', 'subject_id', 'V94', 'V31', 'chartreview_e1_006', 'baseline_b1_001', 'V102', 'V45', 'baseline_c1_003', 'chartreview_d1_001'

### Read the data and fill in the dictionary

In [7]:
# Start from empty per-column lists so that re-running this cell replaces the
# data instead of appending a second copy of every case to the existing lists.
datad = {colname: [] for colname in columns}

with srw.SavReader(savFileName) as reader:      #use the savReaderWriter package to read the .sav contents
    for line in reader:                         #loop through the cases
        for i, colname in enumerate(columns):   #line[i] is the value for columns[i]
            datad[colname].append(line[i])

### Create a data frame from the dictionary

In [8]:
# Build the study DataFrame from the per-column lists; passing `columns`
# fixes the column order to the order of the variable-name list.
scdf = pd.DataFrame(data=datad, columns=columns)

# Preview the first five rows.
scdf.head(5)

Unnamed: 0,baseline_isgnumber,isg_no,CreationDate,CallDate,Note,Terminated_total,firstcigarettereported,firstcigarettesurvey,CallDayNumber,CallType,...,ps2,ps3,psall,Npsall,Nps1,Nps2,Nps3,intervention,filter_$,group_dispositions_tx
0,10664.0,,,,b'no lapse',b'no',2.0,,86.0,b'',...,4.0,4.0,3.333333,2.0,1.0,2.0,3.0,1.0,0.0,2.0
1,10685.0,,,,b'no lapse',b'no',2.0,,86.0,b'',...,4.75,3.25,4.333333,3.0,3.0,3.0,2.0,0.0,0.0,1.0
2,10818.0,,,,b'no lapse',b'no',2.0,,86.0,b'',...,4.5,3.5,3.25,2.0,1.0,3.0,2.0,2.0,0.0,2.0
3,10839.0,,,,b'no lapse',b'no',2.0,,86.0,b'',...,3.75,4.25,3.916667,2.0,2.0,2.0,3.0,1.0,0.0,2.0
4,10937.0,,,,b'no lapse',b'no',2.0,,86.0,b'',...,4.0,3.5,3.5,2.0,1.0,2.0,2.0,0.0,0.0,1.0


In [9]:
# Inspect the dtype pandas inferred for each column.
scdf.dtypes

baseline_isgnumber             float64
isg_no                         float64
CreationDate                    object
CallDate                        object
Note                            object
Terminated_total                object
firstcigarettereported         float64
firstcigarettesurvey           float64
CallDayNumber                  float64
CallType                        object
CallResultTypeID               float64
CallResult                      object
recodedcallresult              float64
Attempts                       float64
V24                            float64
V25                            float64
V26                            float64
V27                            float64
V28                            float64
V29                            float64
V30                            float64
V31                            float64
V32                            float64
V33                            float64
V34                            float64
V35                      

In [10]:
# Print the column Index of the DataFrame (pandas abbreviates the middle of
# a long Index with '...', as seen in the output).
print("Columns: {}".format(scdf.columns))

Columns: Index(['baseline_isgnumber', 'isg_no', 'CreationDate', 'CallDate', 'Note',
       'Terminated_total', 'firstcigarettereported', 'firstcigarettesurvey',
       'CallDayNumber', 'CallType',
       ...
       'ps2', 'ps3', 'psall', 'Npsall', 'Nps1', 'Nps2', 'Nps3', 'intervention',
       'filter_$', 'group_dispositions_tx'],
      dtype='object', length=148)


## Counts by note: non-null CreationDate (335 subjects)

In [11]:
# Keep only rows that have a CreationDate, then count the non-null
# entries of every column within each Note value.
(
    scdf.loc[scdf['CreationDate'].notnull()]
        .groupby('Note')
        .count()
)

Unnamed: 0_level_0,baseline_isgnumber,isg_no,CreationDate,CallDate,Terminated_total,firstcigarettereported,firstcigarettesurvey,CallDayNumber,CallType,CallResultTypeID,...,ps2,ps3,psall,Npsall,Nps1,Nps2,Nps3,intervention,filter_$,group_dispositions_tx
Note,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
b'first cigarette reported',323,323,323,323,323,323,323,323,323,323,...,319,320,320,320,320,319,320,323,320,323
b'first cigarette reported but survey not given',12,12,12,12,12,12,12,12,12,12,...,12,12,12,12,12,12,12,12,12,12


## Counts by note: null CreationDate (101 subjects?)

In [12]:
# Keep only rows with a missing CreationDate, then count the non-null
# entries of every column within each Note value.
(
    scdf.loc[scdf['CreationDate'].isnull()]
        .groupby('Note')
        .count()
)

Unnamed: 0_level_0,baseline_isgnumber,isg_no,CreationDate,CallDate,Terminated_total,firstcigarettereported,firstcigarettesurvey,CallDayNumber,CallType,CallResultTypeID,...,ps2,ps3,psall,Npsall,Nps1,Nps2,Nps3,intervention,filter_$,group_dispositions_tx
Note,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
b'',2,0,0,0,2,2,0,0,2,0,...,2,2,2,2,0,0,0,2,2,2
b'no lapse',39,0,0,0,39,39,0,39,39,0,...,39,39,39,39,39,39,39,39,39,39
b'unenrolled no lapse',1,0,0,0,1,1,0,0,1,0,...,1,1,1,1,0,0,0,1,1,1
"b'unenrolled, no lapse'",58,0,0,0,58,58,0,1,58,0,...,55,55,56,56,0,0,0,58,58,58
"b'unerolled, no lapse'",1,0,0,0,1,1,0,1,1,0,...,1,1,1,1,0,0,0,1,1,1


## Questionable Notes 

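What does `baseline_isgnumber` = 99999.0 mean? Both rows with an empty `Note` carry this value. The other two rows listed below have notes that look like misspelled variants of "unenrolled, no lapse".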

In [13]:
# Rows whose Note is one of the three odd values (Note holds byte strings,
# so the comparison values are bytes too); show only the Note and
# baseline_isgnumber columns for them.
scdf.loc[
    scdf['Note'].isin([b'unenrolled no lapse', b'unerolled, no lapse', b'']),
    ['Note', 'baseline_isgnumber'],
]

Unnamed: 0,Note,baseline_isgnumber
374,b'',99999.0
375,b'',99999.0
376,b'unenrolled no lapse',11126.0
435,"b'unerolled, no lapse'",10727.0


## Counts by CallDayNumber

In [14]:
# Among rows that have a CreationDate, count the non-null entries of every
# column for each CallDayNumber value.
(
    scdf.loc[scdf['CreationDate'].notnull()]
        .groupby('CallDayNumber')
        .count()
)

Unnamed: 0_level_0,baseline_isgnumber,isg_no,CreationDate,CallDate,Note,Terminated_total,firstcigarettereported,firstcigarettesurvey,CallType,CallResultTypeID,...,ps2,ps3,psall,Npsall,Nps1,Nps2,Nps3,intervention,filter_$,group_dispositions_tx
CallDayNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,119,119,119,119,119,119,119,119,119,119,...,118,118,118,118,118,118,118,119,118,119
2.0,83,83,83,83,83,83,83,83,83,83,...,81,82,82,82,82,81,82,83,82,83
3.0,36,36,36,36,36,36,36,36,36,36,...,36,36,36,36,36,36,36,36,36,36
4.0,16,16,16,16,16,16,16,16,16,16,...,15,15,15,15,15,15,15,16,15,16
5.0,10,10,10,10,10,10,10,10,10,10,...,10,10,10,10,10,10,10,10,10,10
6.0,12,12,12,12,12,12,12,12,12,12,...,12,12,12,12,12,12,12,12,12,12
7.0,6,6,6,6,6,6,6,6,6,6,...,6,6,6,6,6,6,6,6,6,6
8.0,5,5,5,5,5,5,5,5,5,5,...,5,5,5,5,5,5,5,5,5,5
9.0,10,10,10,10,10,10,10,10,10,10,...,10,10,10,10,10,10,10,10,10,10
10.0,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3


In [15]:
# Count the non-null entries of every column for each value of
# firstcigarettereported, over all rows.
scdf.groupby(by='firstcigarettereported').count()

Unnamed: 0_level_0,baseline_isgnumber,isg_no,CreationDate,CallDate,Note,Terminated_total,firstcigarettesurvey,CallDayNumber,CallType,CallResultTypeID,...,ps2,ps3,psall,Npsall,Nps1,Nps2,Nps3,intervention,filter_$,group_dispositions_tx
firstcigarettereported,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,62,0,0,0,62,62,0,2,62,0,...,59,59,60,60,0,0,0,62,62,62
1.0,335,335,335,335,335,335,335,335,335,335,...,331,332,332,332,332,331,332,335,332,335
2.0,39,0,0,0,39,39,0,39,39,0,...,39,39,39,39,39,39,39,39,39,39


In [16]:
# Check what kind of object the CallDate column is.
type(scdf['CallDate'])

pandas.core.series.Series

In [17]:
# Check the Python type of the CallDate value stored at index label 1.
type(scdf['CallDate'][1])

NoneType

In [18]:
# Column dtypes again (same expression as the earlier dtypes cell); CallDate
# is listed as dtype object.
scdf.dtypes

baseline_isgnumber             float64
isg_no                         float64
CreationDate                    object
CallDate                        object
Note                            object
Terminated_total                object
firstcigarettereported         float64
firstcigarettesurvey           float64
CallDayNumber                  float64
CallType                        object
CallResultTypeID               float64
CallResult                      object
recodedcallresult              float64
Attempts                       float64
V24                            float64
V25                            float64
V26                            float64
V27                            float64
V28                            float64
V29                            float64
V30                            float64
V31                            float64
V32                            float64
V33                            float64
V34                            float64
V35                      

In [19]:
# Select the columns for the extract: CallDayNumber, firstcigarettereported and
# the baseline b1/c1/d1 items 001-004.
# A stray empty-string label '' used to end this list. SPSS variable names
# cannot be empty, so '' is not a column of scdf and selecting it raises
# KeyError; it has been removed.
crdf = scdf[['CallDayNumber','firstcigarettereported','baseline_b1_001','baseline_c1_001','baseline_d1_001',
      'baseline_b1_002','baseline_c1_002','baseline_d1_002',
      'baseline_b1_003','baseline_c1_003','baseline_d1_003',
      'baseline_b1_004','baseline_c1_004','baseline_d1_004']]

In [20]:
# Write crdf to a CSV file one directory above the working directory.
crdf.to_csv("../savrw2_survival_data.csv")

In [21]:
# Display the baseline_c1_001 column; the double brackets select it as a
# one-column DataFrame rather than a Series.
scdf[['baseline_c1_001']]

Unnamed: 0,baseline_c1_001
0,4.0
1,5.0
2,5.0
3,4.0
4,4.0
5,4.0
6,3.0
7,4.0
8,5.0
9,5.0
