### Logistic Regression using SAS

In [2]:
/* Load kyphosis.csv into WORK.kyphosis; GETNAMES=YES takes the variable
   names from the file's first row, REPLACE overwrites an existing table. */
title 'Reading the csv data';
proc import
    datafile = "kyphosis.csv"
    out      = work.kyphosis
    dbms     = csv
    replace;
    getnames = yes;
run;

/* Preview the first five imported observations. */
proc print data=work.kyphosis (obs=5);
run;

Obs,Kyphosis,Age,Number,Start
1,absent,71,3,5
2,absent,158,3,14
3,present,128,4,5
4,absent,2,5,1
5,absent,1,4,15


In [3]:
title 'Setting the data';

/* Make a working copy of the imported table under the shorter name KYP. */
data kyp;
    set kyphosis;
run;

/* Show the first five observations of the copy. */
proc print data=kyp (obs=5);
run;

Obs,Kyphosis,Age,Number,Start
1,absent,71,3,5
2,absent,158,3,14
3,present,128,4,5
4,absent,2,5,1
5,absent,1,4,15


In [5]:
/* Report data set metadata for KYP: observation count, variable names,
   types, lengths and formats. */
title 'Getting details about data';
proc contents data=kyp;
run;

0,1,2,3
Data Set Name,WORK.KYP,Observations,81
Member Type,DATA,Variables,4
Engine,V9,Indexes,0
Created,09/16/2020 17:44:24,Observation Length,40
Last Modified,09/16/2020 17:44:24,Deleted Observations,0
Protection,,Compressed,NO
Data Set Type,,Sorted,NO
Label,,,
Data Representation,"SOLARIS_X86_64, LINUX_X86_64, ALPHA_TRU64, LINUX_IA64",,
Encoding,utf-8 Unicode (UTF-8),,

Engine/Host Dependent Information,Engine/Host Dependent Information.1
Data Set Page Size,65536
Number of Data Set Pages,1
First Data Page,1
Max Obs per Page,1632
Obs in First Data Page,81
Number of Data Set Repairs,0
Filename,/tmp/SAS_work47C500000983_localhost.localdomain/kyp.sas7bdat
Release Created,9.0401M6
Host Created,Linux
Inode Number,141520

Alphabetic List of Variables and Attributes,Alphabetic List of Variables and Attributes,Alphabetic List of Variables and Attributes,Alphabetic List of Variables and Attributes,Alphabetic List of Variables and Attributes,Alphabetic List of Variables and Attributes
#,Variable,Type,Len,Format,Informat
2,Age,Num,8,BEST12.,BEST32.
1,Kyphosis,Char,9,$9.,$9.
3,Number,Num,8,BEST12.,BEST32.
4,Start,Num,8,BEST12.,BEST32.


In [6]:
/* Default PROC MEANS statistics (N, mean, std dev, min, max) for every
   numeric variable in KYP. */
title 'Getting details about data';
proc means data=kyp;
run;

Variable,N,Mean,Std Dev,Minimum,Maximum
Age Number Start,81 81 81,83.6543210 4.0493827 11.4938272,58.1042512 1.6194230 4.8839622,1.0000000 2.0000000 1.0000000,206.0000000 10.0000000 18.0000000


In [7]:
 /* Missing-value audit for every variable in KYP.
    Two formats collapse each variable to 'Missing' / 'Not Missing':
    $missfmt for character variables (blank = missing) and missfmt for
    numeric variables (. = missing). */
title 'All MISSING VALUES';

proc format;
    value $missfmt
        ' '   = 'Missing'
        other = 'Not Missing';
    value missfmt
        .     = 'Missing'
        other = 'Not Missing';
run;

/* One-way frequency tables on the formatted values: character variables
   first, then numeric ones. MISSING / MISSPRINT keep the 'Missing' level
   in the tables; NOCUM / NOPERCENT reduce the output to plain counts. */
proc freq data=kyp;
    format _char_    $missfmt.
           _numeric_ missfmt.;
    tables _char_ _numeric_ / missing missprint nocum nopercent;
run;

Kyphosis,Frequency
Not Missing,81

Age,Frequency
Not Missing,81

Number,Frequency
Not Missing,81

Start,Frequency
Not Missing,81


In [8]:
/* Class balance of the response: frequency and percentage of each
   Kyphosis level. (Title typo "catagory" corrected.) */
title 'Counting category of kyphosis';
proc freq data=kyp;
    tables kyphosis;
run;

Kyphosis,Frequency,Percent,Cumulative Frequency,Cumulative Percent
absent,64,79.01,64,79.01
present,17,20.99,81,100.0


In [9]:
title 'Replacing absent by 0 and present by 1';

/*
 * Recode the character response Kyphosis ("absent"/"present") into a
 * numeric 0/1 indicator with the same name.
 *
 * The imported Kyphosis column is character ($9.), so assigning 0/1 to it
 * directly makes SAS convert the numbers to text implicitly and leaves the
 * column character. Instead the incoming column is renamed on input, a
 * numeric Kyphosis is derived from it, and the text column is dropped.
 * KYP1 therefore keeps the same variable names and column order, but
 * Kyphosis is now truly numeric.
 */
data kyp1;
    /* Declare the numeric Kyphosis before SET so it stays the first column. */
    length Kyphosis 8;
    set kyp (rename=(Kyphosis=_kyphosis_txt));

    /* A missing label stays missing instead of being counted as "absent";
       the comparison ignores case and surrounding blanks. */
    if missing(_kyphosis_txt) then Kyphosis = .;
    else Kyphosis = (lowcase(strip(_kyphosis_txt)) = "present");

    drop _kyphosis_txt;
run;

/* Check the recoded values on the first five observations. */
proc print data=kyp1 (obs=5);
run;

Obs,Kyphosis,Age,Number,Start
1,0,71,3,5
2,0,158,3,14
3,1,128,4,5
4,0,2,5,1
5,0,1,4,15


In [10]:
/* Full logistic model: Kyphosis on Age, Number and Start.
   DESCENDING reverses the response ordering, so Kyphosis = 1 becomes the
   first ordered level. Predicted probabilities and their confidence
   limits are written to OUTDATA. */
title "Logistic Regression";
proc logistic data=kyp1 descending;
    model kyphosis = age number start;
    output out=outdata
           predicted=pred_prob
           lower=low
           upper=upp;
run;
quit;

Model Information,Model Information.1
Data Set,WORK.KYP1
Response Variable,Kyphosis
Number of Response Levels,2
Model,binary logit
Optimization Technique,Fisher's scoring

0,1
Number of Observations Read,81
Number of Observations Used,81

Response Profile,Response Profile,Response Profile
Ordered Value,Kyphosis,Total Frequency
1,1,17
2,0,64

Model Convergence Status
Convergence criterion (GCONV=1E-8) satisfied.

Model Fit Statistics,Model Fit Statistics,Model Fit Statistics
Criterion,Intercept Only,Intercept and Covariates
AIC,85.234,69.38
SC,87.629,78.958
-2 Log L,83.234,61.38

Testing Global Null Hypothesis: BETA=0,Testing Global Null Hypothesis: BETA=0,Testing Global Null Hypothesis: BETA=0,Testing Global Null Hypothesis: BETA=0
Test,Chi-Square,DF,Pr > ChiSq
Likelihood Ratio,21.8545,3,<.0001
Score,20.8548,3,0.0001
Wald,14.1123,3,0.0028

Analysis of Maximum Likelihood Estimates,Analysis of Maximum Likelihood Estimates,Analysis of Maximum Likelihood Estimates,Analysis of Maximum Likelihood Estimates,Analysis of Maximum Likelihood Estimates,Analysis of Maximum Likelihood Estimates
Parameter,DF,Estimate,Standard Error,Wald Chi-Square,Pr > ChiSq
Intercept,1,-2.0369,1.4496,1.9744,0.16
Age,1,0.0109,0.00645,2.8748,0.09
Number,1,0.4106,0.2249,3.334,0.0679
Start,1,-0.2065,0.0677,9.3045,0.0023

Odds Ratio Estimates,Odds Ratio Estimates,Odds Ratio Estimates,Odds Ratio Estimates
Effect,Point Estimate,95% Wald Confidence Limits,95% Wald Confidence Limits.1
Age,1.011,0.998,1.024
Number,1.508,0.97,2.343
Start,0.813,0.712,0.929

Association of Predicted Probabilities and Observed Responses,Association of Predicted Probabilities and Observed Responses.1,Association of Predicted Probabilities and Observed Responses.2,Association of Predicted Probabilities and Observed Responses.3
Percent Concordant,85.9,Somers' D,0.719
Percent Discordant,14.1,Gamma,0.719
Percent Tied,0.0,Tau-a,0.241
Pairs,1088.0,c,0.859


In [11]:
/* Forward selection over the three candidate predictors (Age, Number,
   Start), starting from the intercept-only model. */
title "Logistic Regression Model Selection";
proc logistic data=kyp1 descending;
    model kyphosis = age number start
          / selection=forward;
run;
quit;

Model Information,Model Information.1
Data Set,WORK.KYP1
Response Variable,Kyphosis
Number of Response Levels,2
Model,binary logit
Optimization Technique,Fisher's scoring

0,1
Number of Observations Read,81
Number of Observations Used,81

Response Profile,Response Profile,Response Profile
Ordered Value,Kyphosis,Total Frequency
1,1,17
2,0,64

Model Convergence Status
Convergence criterion (GCONV=1E-8) satisfied.

0,1,2
-2 Log L,=,83.234

Residual Chi-Square Test,Residual Chi-Square Test,Residual Chi-Square Test
Chi-Square,DF,Pr > ChiSq
20.8548,3,0.0001

Model Convergence Status
Convergence criterion (GCONV=1E-8) satisfied.

Model Fit Statistics,Model Fit Statistics,Model Fit Statistics
Criterion,Intercept Only,Intercept and Covariates
AIC,85.234,72.072
SC,87.629,76.861
-2 Log L,83.234,68.072

Testing Global Null Hypothesis: BETA=0,Testing Global Null Hypothesis: BETA=0,Testing Global Null Hypothesis: BETA=0,Testing Global Null Hypothesis: BETA=0
Test,Chi-Square,DF,Pr > ChiSq
Likelihood Ratio,15.1623,1,<.0001
Score,16.108,1,<.0001
Wald,12.9965,1,0.0003

Residual Chi-Square Test,Residual Chi-Square Test,Residual Chi-Square Test
Chi-Square,DF,Pr > ChiSq
6.0585,2,0.0484

Summary of Forward Selection,Summary of Forward Selection,Summary of Forward Selection,Summary of Forward Selection,Summary of Forward Selection,Summary of Forward Selection
Step,Effect Entered,DF,Number In,Score Chi-Square,Pr > ChiSq
1,Start,1,1,16.108,<.0001

Analysis of Maximum Likelihood Estimates,Analysis of Maximum Likelihood Estimates,Analysis of Maximum Likelihood Estimates,Analysis of Maximum Likelihood Estimates,Analysis of Maximum Likelihood Estimates,Analysis of Maximum Likelihood Estimates
Parameter,DF,Estimate,Standard Error,Wald Chi-Square,Pr > ChiSq
Intercept,1,0.8901,0.63,1.9963,0.1577
Start,1,-0.2179,0.0604,12.9965,0.0003

Odds Ratio Estimates,Odds Ratio Estimates,Odds Ratio Estimates,Odds Ratio Estimates
Effect,Point Estimate,95% Wald Confidence Limits,95% Wald Confidence Limits.1
Start,0.804,0.714,0.905

Association of Predicted Probabilities and Observed Responses,Association of Predicted Probabilities and Observed Responses.1,Association of Predicted Probabilities and Observed Responses.2,Association of Predicted Probabilities and Observed Responses.3
Percent Concordant,80.6,Somers' D,0.647
Percent Discordant,15.9,Gamma,0.67
Percent Tied,3.5,Tau-a,0.217
Pairs,1088.0,c,0.824


In [12]:
 /* Final model: Start is the only predictor. Predicted probabilities and
    their confidence limits are written to OUTDATA. */
title "Logistic Regression Final Model";
proc logistic data=kyp1 descending;
    model kyphosis = start;
    output out=outdata
           predicted=pred_prob
           lower=low
           upper=upp;
run;
quit;

Model Information,Model Information.1
Data Set,WORK.KYP1
Response Variable,Kyphosis
Number of Response Levels,2
Model,binary logit
Optimization Technique,Fisher's scoring

0,1
Number of Observations Read,81
Number of Observations Used,81

Response Profile,Response Profile,Response Profile
Ordered Value,Kyphosis,Total Frequency
1,1,17
2,0,64

Model Convergence Status
Convergence criterion (GCONV=1E-8) satisfied.

Model Fit Statistics,Model Fit Statistics,Model Fit Statistics
Criterion,Intercept Only,Intercept and Covariates
AIC,85.234,72.072
SC,87.629,76.861
-2 Log L,83.234,68.072

Testing Global Null Hypothesis: BETA=0,Testing Global Null Hypothesis: BETA=0,Testing Global Null Hypothesis: BETA=0,Testing Global Null Hypothesis: BETA=0
Test,Chi-Square,DF,Pr > ChiSq
Likelihood Ratio,15.1623,1,<.0001
Score,16.108,1,<.0001
Wald,12.9965,1,0.0003

Analysis of Maximum Likelihood Estimates,Analysis of Maximum Likelihood Estimates,Analysis of Maximum Likelihood Estimates,Analysis of Maximum Likelihood Estimates,Analysis of Maximum Likelihood Estimates,Analysis of Maximum Likelihood Estimates
Parameter,DF,Estimate,Standard Error,Wald Chi-Square,Pr > ChiSq
Intercept,1,0.8901,0.63,1.9963,0.1577
Start,1,-0.2179,0.0604,12.9965,0.0003

Odds Ratio Estimates,Odds Ratio Estimates,Odds Ratio Estimates,Odds Ratio Estimates
Effect,Point Estimate,95% Wald Confidence Limits,95% Wald Confidence Limits.1
Start,0.804,0.714,0.905

Association of Predicted Probabilities and Observed Responses,Association of Predicted Probabilities and Observed Responses.1,Association of Predicted Probabilities and Observed Responses.2,Association of Predicted Probabilities and Observed Responses.3
Percent Concordant,80.6,Somers' D,0.647
Percent Discordant,15.9,Gamma,0.67
Percent Tied,3.5,Tau-a,0.217
Pairs,1088.0,c,0.824


In [13]:
/* Method 1: split the data into train and test sets with a random sort key. */
title 'Splitting Train and Test data';

/* Attach a uniform random key N to every observation (RANUNI, seed 100). */
data kyp2;
    set kyp1;
    n = ranuni(100);
run;

/* Sorting by the random key shuffles the observations. */
proc sort data=kyp2;
    by n;
run;

/* The first 75% of the shuffled rows go to TRAINING, the rest to TESTING.
   NOBS= supplies the total observation count of KYP2. */
data training testing;
    set kyp2 nobs=nobs;
    if _n_ <= 0.75 * nobs then output training;
    else output testing;
run;

proc print data=training;
run;

proc print data=testing;
run;


Obs,Kyphosis,Age,Number,Start,n
1,0,158,3,14,0.00887
2,1,42,7,6,0.01087
3,0,100,3,14,0.03647
4,0,97,3,16,0.04708
5,0,87,4,16,0.07076
6,0,1,3,9,0.10424
7,0,35,3,13,0.11049
8,0,130,5,13,0.12136
9,1,121,3,3,0.13179
10,0,178,4,15,0.14349

Obs,Kyphosis,Age,Number,Start,n
1,0,31,3,16,0.83616
2,1,120,5,8,0.8474
3,0,11,3,15,0.84938
4,1,96,3,12,0.85541
5,1,139,3,10,0.87835
6,0,143,9,3,0.87895
7,0,158,5,14,0.89059
8,0,112,3,16,0.912
9,0,1,2,16,0.92773
10,1,82,5,14,0.93186


In [14]:
/* Method 2: train/test split with PROC SURVEYSELECT.
   A 75% simple random sample is drawn from KYP1. OUTALL keeps every
   observation in KYP1_SELECT and adds a Selected flag (1 = sampled,
   0 = not sampled).
   SEED= makes the split repeatable; without it SAS picks a new seed on
   every run. The value below is the seed SAS reported for the original
   unseeded run, so it is expected to reproduce that split (verify
   against the saved output). */
proc surveyselect data=kyp1
                  out=kyp1_select outall
                  method=srs
                  samprate=0.75
                  seed=678627786;
run;


0,1
Selection Method,Simple Random Sampling

0,1
Input Data Set,KYP1
Random Number Seed,678627786
Sampling Rate,0.75
Sample Size,61
Selection Probability,0.753086
Sampling Weight,0
Output Data Set,KYP1_SELECT


In [15]:
/* Route each observation of KYP1_SELECT by its Selected flag:
   Selected = 1 goes to KYP1_TRAIN, everything else to KYP1_TEST. */
data kyp1_train kyp1_test;
    set kyp1_select;
    if selected = 1 then
        output kyp1_train;
    else
        output kyp1_test;
run;

/* List both partitions. */
proc print data=kyp1_train;
run;

proc print data=kyp1_test;
run;
quit;

Obs,Selected,Kyphosis,Age,Number,Start
1,1,0,71,3,5
2,1,0,158,3,14
3,1,1,128,4,5
4,1,0,2,5,1
5,1,0,1,4,15
6,1,0,1,2,16
7,1,0,37,3,16
8,1,0,113,2,16
9,1,1,82,5,14
10,1,0,18,5,2

Obs,Selected,Kyphosis,Age,Number,Start
1,0,0,61,2,17
2,0,1,59,6,12
3,0,0,148,3,16
4,0,0,1,3,16
5,0,0,78,6,15
6,0,0,131,2,3
7,0,0,31,3,16
8,0,0,140,5,11
9,0,0,136,4,15
10,0,1,121,3,3


In [29]:
/* Fit the three-predictor model on the training partition, modelling the
   probability of Kyphosis = 1. CLODDS=PL requests profile-likelihood
   confidence intervals for the odds ratios. The SCORE statement applies
   the fitted model to KYP1_TEST and saves the predictions in MYPREDS. */
title 'Logistic Reg';
proc logistic data=kyp1_train plots=all;
    model Kyphosis (event='1') = Age Number Start
          / clodds=pl;
    score data=kyp1_test
          out=mypreds;
run;

Model Information,Model Information.1
Data Set,WORK.KYP1_TRAIN
Response Variable,Kyphosis
Number of Response Levels,2
Model,binary logit
Optimization Technique,Fisher's scoring

0,1
Number of Observations Read,61
Number of Observations Used,61

Response Profile,Response Profile,Response Profile
Ordered Value,Kyphosis,Total Frequency
1,0,48
2,1,13

Model Convergence Status
Convergence criterion (GCONV=1E-8) satisfied.

Model Fit Statistics,Model Fit Statistics,Model Fit Statistics
Criterion,Intercept Only,Intercept and Covariates
AIC,65.203,54.638
SC,67.314,63.082
-2 Log L,63.203,46.638

Testing Global Null Hypothesis: BETA=0,Testing Global Null Hypothesis: BETA=0,Testing Global Null Hypothesis: BETA=0,Testing Global Null Hypothesis: BETA=0
Test,Chi-Square,DF,Pr > ChiSq
Likelihood Ratio,16.5644,3,0.0009
Score,16.1465,3,0.0011
Wald,10.5474,3,0.0144

Analysis of Maximum Likelihood Estimates,Analysis of Maximum Likelihood Estimates,Analysis of Maximum Likelihood Estimates,Analysis of Maximum Likelihood Estimates,Analysis of Maximum Likelihood Estimates,Analysis of Maximum Likelihood Estimates
Parameter,DF,Estimate,Standard Error,Wald Chi-Square,Pr > ChiSq
Intercept,1,-2.4723,1.7052,2.1022,0.1471
Age,1,0.00781,0.00708,1.2184,0.2697
Number,1,0.4859,0.2665,3.3241,0.0683
Start,1,-0.1699,0.0826,4.2355,0.0396

Association of Predicted Probabilities and Observed Responses,Association of Predicted Probabilities and Observed Responses.1,Association of Predicted Probabilities and Observed Responses.2,Association of Predicted Probabilities and Observed Responses.3
Percent Concordant,85.9,Somers' D,0.718
Percent Discordant,14.1,Gamma,0.718
Percent Tied,0.0,Tau-a,0.245
Pairs,624.0,c,0.859

Odds Ratio Estimates and Profile-Likelihood Confidence Intervals,Odds Ratio Estimates and Profile-Likelihood Confidence Intervals,Odds Ratio Estimates and Profile-Likelihood Confidence Intervals,Odds Ratio Estimates and Profile-Likelihood Confidence Intervals,Odds Ratio Estimates and Profile-Likelihood Confidence Intervals
Effect,Unit,Estimate,95% Confidence Limits,95% Confidence Limits.1
Age,1.0,1.008,0.994,1.023
Number,1.0,1.626,1.008,2.921
Start,1.0,0.844,0.709,0.987


In [22]:
/* List the scored test observations held in MYPREDS. */
proc print data=mypreds;
run;

Obs,Selected,Kyphosis,Age,Number,Start,F_Kyphosis,I_Kyphosis,P_1,P_0
1,0,0,61,2,17,0,0,0.01959,0.98041
2,0,1,59,6,12,1,0,0.24316,0.75684
3,0,0,148,3,16,0,0,0.07059,0.92941
4,0,0,1,3,16,0,0,0.02353,0.97647
5,0,0,78,6,15,0,0,0.1829,0.8171
6,0,0,131,2,3,0,0,0.27145,0.72855
7,0,0,31,3,16,0,0,0.02956,0.97044
8,0,0,140,5,11,0,0,0.30602,0.69398
9,0,0,136,4,15,0,0,0.11757,0.88243
10,0,1,121,3,3,1,0,0.35905,0.64095
