In [None]:
'''                                         cell 0
Purpose: I am given a set of data pertaining to a survey given to users who chose to download a package. I am assigned a 
         certain response variable i.e. a certain response to one of the survey questions. My tasked is to modeling the 
         response. The variable I am given is to Question 5. Question 5-4 asks how important were the given factors in 
         choosing to the package. The response variable I am to model is how important was visible growth in popularity
         (PG5_4VGP) in choosing this package. The possible responses are:
                                                           0 not a priority
                                                           1 low priority
                                                           2 medium priority
                                                           3 high priority
                                                           4 essential

Work Flow:
    1) My assumptions:
        a) there is some relation between the response of the survey question 5-4 and the meta and para data related to this 
           question i.e:
                        -total survey time
                        -time to answer Quesion 5
                        -Time from when Question 5 response 4 was answered
                        -Time from when Question 5 was started til response 4 was submitted
                        -Order number in which response was submitted
        b) Surveys that are not complete are can be discarded
        c) Surveys lacking related responses can be discarded
    2) Problem to be solved:
        a) Predicting how important a visible growth in popularity was in the person choosing the package

'''

In [561]:
get_predictions <- function(predictions, res_l){
    ret_vec = c()
    
    for(i in 1:nrow(predictions)){
        
        ro <- predictions[i,]
        mx <-max(ro)
        ret_vec[length(ret_vec)+1 ] <- res[ match(mx, ro)] 
    }
    
    return (ret_vec)
}

In [602]:
make_data_frame <-function(df, dim_nm, l){
    
    ret_df <-data.frame(df[,1])
    dimnames(ret_df)[[2]]<-c(dim_nm[1])
    
    
    for(i in 2:length(l)){
        
        ret_df[dim_nm[i]]<-df[,l[i]]
    }
    
    return(ret_df)
}

In [362]:
#                                     Cell 1 
#                                  function cell

#converts a given date to unix seconds
date_to_unix_sec <- function(val){
    
    num = as.numeric(as.POSIXct(strptime(val, "%Y-%m-%d %H:%M:%S")))
    
    return(num)
}


#Given a set of start times and a set of time stamps to go to will return the total seconds between the two
# values should already be converted to seconds
# makes sure the values are numeric in the return list
seconds_since_start<-function(start, time_stamp)
{
    dif_l = c()
    
    for(i in 1:length(time_stamp) ){
        
        dif_l[i] <- as.numeric(time_stamp[i]) - as.numeric(start[i])
    }
    
    return(dif_l)
}

make_ignore_list <- function(array_l){
    
    ignore_l = list()
    
    idx = 1
    for(entry in array_l){
        
        if(entry == '' | is.na(entry)){
            
            ignore_l[length(ignore_l)+1] = as.numeric(idx)
        }
        idx <- idx + 1
    }
    #print(length(ignore_l))
    return(ignore_l)
    
}

# used to grab the data from a given data frame column and return only valid values.
# Uses the ignore list to ignore a given set of indexs in the column
# also ignores empty and NA responses
# TODO: should make this add to the ignore_l list as it goes
# has a set of string responses to look for as well
grab_dataC <-function(d_col, ignore_l){
    
    ret_list <- c()
    actual_l <-list()
    
    
    #                        1                2               3                   4                 5
    valid_resp <- list('Not a Priority', 'Low Priority', 'Medium Priority', 'High Priority',   'Essential')
    
    #  age           1          2          3          4          5           6 
    a_opt = list("18 - 24", "25 - 34", "35 - 44", "45 - 54", "55 - 64", "65 and over" )
    
    # 1st commit with package options
    b_opt = list('Yes', 'No', 'Not sure')
    
    # # of projects work on options
    #               1        2        3          4          5           6        
    e_opt = list('2 - 3', '4 - 6', '7 - 10', '11 - 15', '16 - 25', 'More than 25')
    
    # Experience level options
    #                    1                2               3              4               5               6
    f_opt = list('Less than 2 years', '2 - 5 years', '6 - 8 years', '9 - 12 years', '13 - 19 years', '20 years or more')
    
    
    # go through entries in given data column
    for(i in 1:length(d_col)){
        #if index i is not in the ignore list
        if(!(i %in% ignore_l)){
            item = d_col[i]
            #print(item)
            #if the entry is not empty or non applicable(NA)
            if(item != '' & !is.na(item)){    
               if((item %in% valid_resp)){
                    #print('found a response variable')
                    val = match(item,valid_resp)
                    if(!is.na(val)){
                        #ret_list[length(ret_list)+1] = val
                        #print('option resp')
                        ret_list[length(ret_list)+1] = as.character(item)
                    }
                }else if(item %in% a_opt){
                   #val = match(item,a_opt)
                    #if(!is.na(val)){
                        #ret_list[length(ret_list)+1] = val
                        #print('option a')
                        ret_list[length(ret_list)+1] = as.character(item)
                    #}
                }else if(item %in% f_opt){
                   #val = match(item,f_opt)
                    #if(!is.na(val)){
                        #ret_list[length(ret_list)+1] = val
                       #print('option f') 
                       ret_list[length(ret_list)+1] = as.character(item)
                    #}
                }else if(item %in% e_opt){
                   #val = match(item,e_opt)
                    #if(!is.na(val)){
                        #ret_list[length(ret_list)+1] = val
                       #print('option e') 
                       ret_list[length(ret_list)+1] = as.character(item)
                    #}
                }else if(item %in% b_opt){
                   #val = match(item,b_opt)
                    #if(!is.na(val)){
                        #ret_list[length(ret_list)+1] = val
                       #print('option b') 
                       ret_list[length(ret_list)+1] = as.character(item)
                    #}
                }else if(is.numeric(item)){
                    ret_list[length(ret_list)+1] = as.numeric(item)
                }else{
                    #secs = date_to_unix_sec(item)
                    #ret_list[length(ret_list)+1] = item
                }
            }else{
                  
                #ignore_l[length(ignore_l)+1] = i
            }
            
        }
    }
    
    
    
    #ret_vect = list(ret_list, ignore_l)
    
    #return(ret_vect)
    return(ret_list)
}


# used to grab the data from a given data frame column and return only valid values.
# Uses the ignore list to ignore a given set of indexs in the column
# also ignores empty and NA responses
# TODO: should make this add to the ignore_l list as it goes
# has a set of string responses to look for as well
grab_dataL <-function(d_col, ignore_l){
    
    ret_list <- list()
    actual_l <-list()
    
    
    #                        1                2               3                   4                 5
    valid_resp <- list('Not a Priority', 'Low Priority', 'Medium Priority', 'High Priority',   'Essential')
    
    #  age           1          2          3          4          5           6 
    a_opt = list("18 - 24", "25 - 34", "35 - 44", "45 - 54", "55 - 64", "65 and over" )
    
    # 1st commit with package options
    b_opt = list('Yes', 'No', 'Not sure')
    
    # # of projects work on options
    #               1        2        3          4          5           6        
    e_opt = list('2 - 3', '4 - 6', '7 - 10', '11 - 15', '16 - 25', 'More than 25')
    
    # Experience level options
    #                    1                2               3              4               5               6
    f_opt = list('Less than 2 years', '2 - 5 years', '6 - 8 years', '9 - 12 years', '13 - 19 years', '20 years or more')
    
    
    # go through entries in given data column
    for(i in 1:length(d_col)){
        #if index i is not in the ignore list
        if(!(i %in% ignore_l)){
            item = d_col[i]
            #print(item)
            #if the entry is not empty or non applicable(NA)
            if(item != '' & !is.na(item)){    
               if((item %in% valid_resp)){
                    #print('found a response variable')
                    val = match(item,valid_resp)
                    if(!is.na(val)){
                        #ret_list[length(ret_list)+1] = val
                        #print('option resp')
                        ret_list[length(ret_list)+1] = as.character(item)
                    }
                }else if(item %in% a_opt){
                   #val = match(item,a_opt)
                    #if(!is.na(val)){
                        #ret_list[length(ret_list)+1] = val
                        #print('option a')
                        ret_list[length(ret_list)+1] = as.character(item)
                    #}
                }else if(item %in% f_opt){
                   #val = match(item,f_opt)
                    #if(!is.na(val)){
                        #ret_list[length(ret_list)+1] = val
                       #print('option f') 
                       ret_list[length(ret_list)+1] = as.character(item)
                    #}
                }else if(item %in% e_opt){
                   #val = match(item,e_opt)
                    #if(!is.na(val)){
                        #ret_list[length(ret_list)+1] = val
                       #print('option e') 
                       ret_list[length(ret_list)+1] = as.character(item)
                    #}
                }else if(item %in% b_opt){
                   #val = match(item,b_opt)
                    #if(!is.na(val)){
                        #ret_list[length(ret_list)+1] = val
                       #print('option b') 
                       ret_list[length(ret_list)+1] = as.character(item)
                    #}
                }else if(is.numeric(item)){
                    ret_list[length(ret_list)+1] = as.numeric(item)
                }else{
                    #secs = date_to_unix_sec(item)
                    #ret_list[length(ret_list)+1] = item
                }
            }else{
                  
                #ignore_l[length(ignore_l)+1] = i
            }
            
        }
    }
    
    return(ret_list)
}



In [363]:
#                                          cell 1
# like in the example for pretty printing?
# sets the number of decimal places to display to two
options(digits = 9)

In [364]:
#                                          cell 2
# work flow: Data Extraction

data <- read.csv('TechSurvey - Survey.csv', header = T)

#grab the names of collumns for data exploration
col_names <-colnames(data)

#print(data$Start)

orig_d_size = 1353

#convert date to unix second
# start and end times
# then do submission times
for (i in c("Start", "End")) 
    data[,i] = as.numeric(as.POSIXct(strptime(data[,i], "%Y-%m-%d %H:%M:%S")))
for (i in 0:12){
    vnam = paste(c("PG",i,"Submit"), collapse="")
    #t_subs[i+1] = as.numeric(as.POSIXct(strptime(data[,vnam], "%Y-%m-%d %H:%M:%S")))
    data[,vnam] = as.numeric(as.POSIXct(strptime(data[,vnam], "%Y-%m-%d %H:%M:%S"))) 
}

resp_names <- list()
print(data$PG4Submit)

#print(data$PG2Resp)


#convert responses to q5 to seconds
for (i in 1:13){
    vnam = paste(c("PG5_",i,"Time"), collapse="")
    resp_names[i] = vnam
    #t_subs[i+1] = as.numeric(as.POSIXct(strptime(data[,vnam], "%Y-%m-%d %H:%M:%S")))
    data[,vnam] = as.numeric(as.POSIXct(strptime(data[,vnam], "%Y-%m-%d %H:%M:%S"))) 
}






   [1]         NA         NA 1539263572         NA 1539263663         NA
   [7] 1539263644         NA 1539263635         NA         NA         NA
  [13] 1539263735 1539263783 1539264751         NA         NA 1539263749
  [19]         NA         NA 1539263836 1539263773         NA         NA
  [25] 1539263773         NA 1539263930 1539264607 1539263980 1539263813
  [31]         NA 1539263864 1539264528 1539265208         NA 1539263813
  [37]         NA 1539263865 1539263838 1539263876         NA 1539263837
  [43] 1539263904 1539263906 1539263903 1539263882         NA         NA
  [49] 1539264007 1539263951         NA 1539264059 1539264170 1539264051
  [55] 1539264875 1539264057 1539264069 1539264173 1539264120         NA
  [61] 1539264211 1539264117         NA 1539264124 1539264136 1539264301
  [67] 1539264326         NA         NA 1539264183 1539264306 1539264292
  [73]         NA 1539264306 1539264273         NA         NA         NA
  [79] 1539264424 1539264422 1539264435         NA 

In [365]:
#print(data$PG2Resp)
#print(data$PG6Resp)
#print(data$PG9Resp)
#print(data$PG12Resp)
print(data$PG5_4Time)

   [1]         NA         NA         NA         NA 1539263696         NA
   [7] 1539263667         NA 1539263677         NA         NA         NA
  [13]         NA 1539263810         NA         NA         NA 1539263790
  [19]         NA         NA 1539263856 1539263810         NA         NA
  [25]         NA         NA 1539264013 1539264673 1539264098 1539263858
  [31]         NA 1539263910 1539264589         NA         NA 1539263854
  [37]         NA         NA 1539263876 1539263931         NA 1539263866
  [43] 1539263953 1539263985 1539263945 1539263908         NA         NA
  [49]         NA         NA         NA 1539264083 1539264226 1539264087
  [55] 1539264917         NA 1539264101 1539264245 1539264149         NA
  [61]         NA         NA         NA         NA 1539264164         NA
  [67]         NA         NA         NA         NA 1539264390 1539264330
  [73]         NA         NA         NA         NA         NA         NA
  [79]         NA         NA 1539264490         NA 

In [366]:
#make a bunch of ignore lists and merge them

#response variable
PG5_4VGP_ign = make_ignore_list(data$PG5_4VGP)  #what response it got


#para data
PG5_4order_ign = make_ignore_list(data$PG5_4Order) # order in which submission was chosen
ignore_lT5_4 =make_ignore_list(data$PG5_4Time) # time response was submitted 
ign_5Sub <- make_ignore_list(data$PG5Submit)   # time q5 was submitted
ign_4Sub <- make_ignore_list(data$PG4Submit)   # time q4 was submitted
start_ign <- make_ignore_list(data$Start)
end_ign <- make_ignore_list(data$End)

#meta data
PG2Resp_ign <- make_ignore_list(data$PG2Resp)
PG6Resp_ign <- make_ignore_list(data$PG6Resp)
PG9Resp_ign <- make_ignore_list(data$PG9Resp)
PG12Resp_ign <- make_ignore_list(data$PG12Resp)


# now create a big ignore list to remove uneeded data

vgp_ignore <- merge(PG5_4VGP_ign, PG5_4order_ign)
vgp_ignore <- merge(vgp_ignore, ignore_lT5_4)
vgp_ignore <- merge(vgp_ignore, ign_5Sub)
vgp_ignore <- merge(vgp_ignore, ign_4Sub)
vgp_ignore <- merge(vgp_ignore, start_ign)
vgp_ignore <- merge(vgp_ignore, end_ign)
vgp_ignore <- merge(vgp_ignore, PG2Resp_ign)
vgp_ignore <- merge(vgp_ignore,PG6Resp_ign)
vgp_ignore <- merge(vgp_ignore, PG12Resp_ign)
vgp_ignore <- merge(vgp_ignore, PG9Resp_ign)

big_ignore <- unique(vgp_ignore)
print('Length of big ignore is:')
print(length(big_ignore))
print('Original data length is:')
print(orig_d_size)
print('Should end up with data size of:')
print(1353L - 912L)
#print(sort(big_ignore))





[1] "Length of big ignore is:"
[1] 912
[1] "Original data length is:"
[1] 1353
[1] "Should end up with data size of:"
[1] 441


In [367]:
# using the created ignore list grab sets of cleaned up data
respTime5_4 <- grab_dataC(data$PG5_4Time, big_ignore) 
#print(data$PG4Submit)
print(respTime5_4)
PG5sub <- grab_dataC(data$PG5Submit, big_ignore)
PG4sub <- grab_dataC(data$PG4Submit, big_ignore)
#print(length(PG5sub))
print(PG5sub)
#SubTime <- grab_data(data$PG5Submit, list())

TR54S5 <- seconds_since_start(respTime5_4, PG5sub)
#print(TRS)
TS4R54 <- seconds_since_start(PG4sub, respTime5_4)
#print(TS4R54)

#calculate differences in time    
for (i in 12:0){
    pv = paste(c("PG",i-1,"Submit"), collapse="");
    if (i==0) 
        pv="Start";
    vnam = paste(c("PG",i,"Submit"), collapse="");
    data[,vnam] = data[,vnam] -data[,pv];
}



start_t <- grab_dataC(data$Start, big_ignore)
end_t <- grab_dataC(data$End, big_ignore)

PG54O <- grab_dataC(data$PG5_4Order, big_ignore)
PG54R <- grab_dataC(data$PG5_4VGP, big_ignore)
PG2R<- grab_dataC(data$PG2Resp, big_ignore)
PG6R<- grab_dataC(data$PG6Resp, big_ignore)
PG9R<- grab_dataC(data$PG9Resp, big_ignore)
PG12R<- grab_dataC(data$PG12Resp, big_ignore)
#<- grab_data(data$, big_ignore)
#<- grab_data(data$, big_ignore)
print(PG54R[1])

TT <- seconds_since_start(start_t, end_t)

TPG5S <- grab_dataC(data$PG5Submit, big_ignore)


  [1] 1539263696 1539263667 1539263677 1539263810 1539263790 1539263856
  [7] 1539263810 1539264013 1539264673 1539264098 1539263858 1539263910
 [13] 1539264589 1539263854 1539263876 1539263931 1539263866 1539263953
 [19] 1539263985 1539263945 1539264083 1539264226 1539264087 1539264917
 [25] 1539264101 1539264245 1539264149 1539264164 1539264390 1539264330
 [31] 1539264490 1539264428 1539264518 1539264448 1539264477 1539264516
 [37] 1539264548 1539265059 1539264609 1539264585 1539264596 1539264591
 [43] 1539264603 1539264582 1539264657 1539264591 1539264601 1539275636
 [49] 1539264614 1539265004 1539264726 1539264760 1539264775 1539264781
 [55] 1539264972 1539264809 1539264803 1539264871 1539264913 1539264911
 [61] 1539264874 1539264903 1539264922 1539264963 1539265002 1539264941
 [67] 1539265091 1539265103 1539264936 1539265059 1539265065 1539265175
 [73] 1539265137 1539265091 1539265092 1539265399 1539265100 1539265123
 [79] 1539265168 1539265193 1539265199 1539265248 1539265327 153

[1] "High Priority"


In [368]:
# my predictor values

# work flow:
#    My hypotheses:
'
            H1: The meta and para data associated with my response variable relates to the resulting response i.e. 
                1-order of response (in what order was this choice picked) PG54O
                2-time of response (how long did it take to respond with this response ) PG54R
                3-submit question time(how long it took to complete this question) PG54R
            H2: the overall time the user took to complete the survery is relavent TT
            H3: The time it took to get to this question is relevant 
            H3: how much time it took to finish the survey after submitting question 5
            H4: the response to was the the first commit with this package speaks to the intent of user
            H5: the Experience level speaks to the inent of the user
            H6: The age of the user speaks to how visible they want to be
            H7: how many projects the user has worked on speaks to their experience with speaks to thier intent
'

# My predictors:
'
    __type____|___Name_____|________________________________Description______________________________________         
    response  | PG54R      | how important was visible growth in popularity in choosing the package
              |            | (1-5) (Not a Priority - Essential)
    __________|____________|_________________________________________________________________________________
              | TS4R54     | the time it took to choose the response from the survey (PG5_4Time - PG4Submit)
      Para    | TPG5S      | the time it took(TT) for question 5 to be submitted (PG5submit - PG4Submit )
      Data    | TR54S5     | the time it took from when choice was made til question 5 was subed(PG5_4Time)
              | TT         | the time the survey took to complete(End - Start)  
              | PG54O      | The order which the response was submitted 
   ___________|____________|_________________________________________________________________________________
              |   PG2R     | responses to question 2(1st commit with package?)
     Meta     |   PG6R     | responses to question 6(Experience level)
     Data     |   PG9R     | responses to question 9(number of previous projects)
              |   PG12R    | responses to question 12(age of user)
   ___________|____________|__________________________________________________________________________________           

''

ERROR: Error in parse(text = x, srcfile = src): <text>:20:1: unexpected INCOMPLETE_STRING
36: 
37: 
    ^


In [None]:
# Model Selection
# I feel this is a classification problem
# I need to look at given predictors and get classify a response
# classifications are: 1-Not a Priority
#                      2-Low Priority
#                      3-Medium Priority
#                      5-High Priority
#                      6-Essential

In [570]:
# lets look at my predictors and response arrays

#response
#print(PG54R)

#time between question start and response chosen
#print(TS4R54)

#time it took to do question 5
#print(TPG5S)

#time between chosing respons and finishing question 5
#print(TR54S5)

#Total time it took to do the quiz
#print(TT)

# order in which response 5-4 was chosen
#print(PG54O)

#responses to question 12 age of user
#print(PG12R)

#responses to question 9 number of projects completed
#print(PG9R)

#responses to question 6 Experience level
#print(PG6R)

#responses to question 2 1st commit with this package
#print(PG2R)



In [257]:
441*.76
336/441
441 - 360
81/9

In [378]:
#options(stringsAsFactors = TRUE)

# make a data frame of my cleaned data
#train_df <- data.frame(cbind(PG2R[1:360], PG6R[1:360], PG9R[1:360], PG12R[1:360], PG54O[1:360],
#                          TT[1:360], TR54S5[1:360],TPG5S[1:360], TS4R54[1:360], PG54R[1:360]))

my_df <- data.frame(cbind(PG2R, PG6R, PG9R, PG12R, PG54O,
                          TT, TR54S5,TPG5S, TS4R54, PG54R))

print(length(my_df))
dimnames(my_df)[[2]] <- c('PG2R','PG6R','PG9R','PG12R','PG54O','TT','TR54S5','TPG5S', 'TS4R54','VGPR'  ) 
print(my_df)

"number of rows of result is not a multiple of vector length (arg 3)"

[1] 10
        PG2R              PG6R         PG9R       PG12R PG54O    TT TR54S5
1         No Less than 2 years        4 - 6     55 - 64     4   270     48
2        Yes     13 - 19 years      11 - 15     45 - 54     3   149     30
3         No       2 - 5 years      11 - 15     35 - 44     9   132     13
4   Not sure       6 - 8 years        4 - 6     18 - 24     8   185      9
5         No       2 - 5 years        4 - 6     35 - 44     4   178     27
6        Yes       6 - 8 years        4 - 6     35 - 44     2   237     42
7         No       6 - 8 years        4 - 6     25 - 34     2   175     24
8        Yes Less than 2 years        2 - 3     18 - 24    13   374      7
9        Yes       2 - 5 years       7 - 10     35 - 44     8  1119     49
10        No       2 - 5 years More than 25     25 - 34    13   419      2
11  Not sure Less than 2 years        4 - 6     35 - 44     4   221     39
12  Not sure       2 - 5 years        4 - 6     25 - 34     3   290     72
13        No      

In [655]:
#####################################################################################################################
#####################################################################################################################
#####################################################################################################################
#####################################################################################################################
#####################################################################################################################
#####################################################################################################################
#####################################################################################################################
#####################################################################################################################

In [705]:
# create randomized training and test sets

#set.seed(13)

id <- sample(2, nrow(my_df),  prob= c(.7, .3), replace = T)

d_train = my_df[id==1,]
d_test = my_df[id==2,]
#print(d_train)
#nrow(d_train)
#nrow(d_test)

In [706]:
#d_train$PGR <- as.d_train$PGR

d_train$TT <- as.numeric(d_train$TT)
d_train$TR54S5 <- as.numeric(d_train$TR54S5)
d_train$TPG5S <- as.numeric(d_train$TPG5S)
d_train$TS4R54 <- as.numeric(d_train$TS4R54)



#str(d_train)

In [707]:
d_test$TT <- as.numeric(d_test$TT)
d_test$TR54S5 <- as.numeric(d_test$TR54S5)
d_test$TPG5S <- as.numeric(d_test$TPG5S)
d_test$TS4R54 <- as.numeric(d_test$TS4R54)


#str(d_test)

#colnames(my_df)

In [708]:
#Naive Bayes

library(e1071)
library(caret)
library(lattice)
library(ggplot2)

#          all predictors (1-9)
#vgp_nb <- naiveBayes(VGPR ~ ., data = d_train, type = "class")                              #.211,.212,.284,.295,.26

#          all response based (1-5)
#vgp_nb <- naiveBayes(VGPR ~ PG2R + PG6R + PG9R + PG12R + PG54O, data = d_train, type = "class") #.256,.252,.275,.271,.23

#       all time based (6-9)
#vgp_nb <- naiveBayes(VGPR ~ TT + TR54S5 + TPG5S + TS4R54, data = d_train, type = "class")    #.33,.328,.295,.312,.301

#                      (1-2)
vgp_nb <- naiveBayes(VGPR ~ PG2R+PG6R , data = d_train, type = "class")                       #.259,339,
#vgp_nb <- naiveBayes(VGPR ~ PG6R, data = d_train, type = "class")
#vgp_nb <- naiveBayes(VGPR ~ PG9R, data = d_train, type = "class")
#vgp_nb <- naiveBayes(VGPR ~ PG12, data = d_train, type = "class")
#vgp_nb <- naiveBayes(VGPR ~ PG54O, data = d_train, type = "class")
#vgp_nb <- naiveBayes(VGPR ~ TT, data = d_train, type = "class")
#vgp_nb <- naiveBayes(VGPR ~ TR54S5, data = d_train, type = "class")
#vgp_nb <- naiveBayes(VGPR ~ TPG5S, data = d_train, type = "class")
#vgp_nb <- naiveBayes(VGPR ~ TS4R54, data = d_train, type = "class")
#vgp_nb <- naiveBayes(VGPR ~ PG2R + PG6R + PG9R, data = d_train, type = "class")
#vgp_nb <- naiveBayes(VGPR ~ PG6R + PG9R, data = d_train, type = "class")
#vgp_nb <- naiveBayes(VGPR ~ PG2R + PG9R, data = d_train, type = "class")
#vgp_nb <- naiveBayes(VGPR ~ PG2R + PG12R, data = d_train, type = "class")
#vgp_nb <- naiveBayes(VGPR ~ PG2R + PG54O, data = d_train, type = "class")
#vgp_nb <- naiveBayes(VGPR ~ PG2R, data = d_train, type = "class")
#vgp_nb <- naiveBayes(VGPR ~ PG12R + PG54O, data = d_train, type = "class")
#vgp_nb <- naiveBayes(VGPR ~ PG2R + PG9R  + TT + TR54S5 + TPG5S + TS4R54, data = d_train, type = "class") #(1,3,6,7,8,9)
#vgp_nb <- naiveBayes(VGPR ~ , data = d_train, type = "class")
#vgp_nb <- naiveBayes(VGPR ~ , data = d_train, type = "class")
#vgp_nb <- naiveBayes(VGPR ~ , data = d_train, type = "class")
#vgp_nb <- naiveBayes(VGPR ~ , data = d_train, type = "class")



#vgp_nb



In [None]:
#                                Prediction
#  1      2     3       4       5      6     7        8       9    
#PG2R' 'PG6R' 'PG9R' 'PG12R' 'PG54O' 'TT' 'TR54S5' 'TPG5S' 'TS4R54' 'VGPR'

options(digits = 2)

res <- c('Essential', 'High Priority' , 'Low Priority',  'Medium Priority', 'Not a Priority')

#x_datar <- d_test[,1:9] # all
#x_datar <- d_test[,1:5]  # just responses
x_datar <- d_test[,1:2]     # 
#x_datar <- d_test[,6:9] # just times
#x_datar <- d_test[,1:3]    #PG2 + 6r + 9r
#x_datar <- d_test[,4:5]  #PG12 + PG54O
#x_datar <- d_test[,3]    #PG9
#x_datar <- d_test[,4]    #PG12
#x_datar <- d_test[,5]    #PG54O
#x_datar <- d_test[,6]    #TT
#x_datar <- d_test[,7]    #TR54S5
#x_datar <- d_test[,8]    #TPG5S
#x_datar <- d_test[,9]    #TS4R54

#x_datar <- data.frame(d_test[,2], d_test[,5])
#dimnames(x_datar)[[2]]<-c('PG6R', 'PG54O')
#x_datar <- make_data_frame(d_test, c('PG6R', 'PG54O', 'TT'), c(2,5,6))
#x_datar <- make_data_frame(d_test, c('PG6R', 'TT','TR54S5', 'TPG5S', 'TS4R54'), c(2,6,7, 8, 9))
#x_datar <- make_data_frame(d_test, c('PG2R', 'TT','TR54S5', 'TPG5S', 'TS4R54'), c(1,6,7, 8, 9))**
#x_datar <- make_data_frame(d_test, c('PG9R', 'TT','TR54S5', 'TPG5S', 'TS4R54'), c(3,6,7, 8, 9))
#x_datar <- make_data_frame(d_test, c('PG12R', 'TT','TR54S5', 'TPG5S', 'TS4R54'), c(4,6,7, 8, 9))
#x_datar <- make_data_frame(d_test, c('PG54O', 'TT','TR54S5', 'TPG5S', 'TS4R54'), c(5,6,7, 8, 9))
#x_datar <- make_data_frame(d_test, c('PG2R', 'PG9R', 'TT','TR54S5', 'TPG5S', 'TS4R54'), c(1,3,6,7, 8, 9)) #Acc : 0.321168
#x_datar

#print('test df')
#test_df

y_datar <- d_test[, 10]


preee3 <- predict(vgp_nb, newdata = x_datar, type = "raw")
#preee3 <- predict(vgp_nb, newdata = d_test, type="raw" )

#preee3

p_lr <-get_predictions(preee3, res)

#p_lr

#pridd_dfr = data.frame(cbind(p_lr, y_datar))
#pridd_dfr

ur <- union(p_lr, y_data)
tr <- table(factor(p_lr, ur), factor(y_datar, ur))
print('The table:')
tr
print('---------------------------------------------------------------')
print('The confustion Matrix:')
confusionMatrix(tr)

#confusionMatrix(table(preee3, d_test$VGPR))

In [396]:
#str(data)

In [397]:
#                                                 Training set creatation

# options(stringsAsFactors = FALSE)
# create a partioned set of training and test data
# make a data frame of my cleaned data
train_df <- data.frame(cbind(PG2R[1:360], PG6R[1:360], PG9R[1:360], PG12R[1:360], PG54O[1:360],
                          TT[1:360], TR54S5[1:360],TPG5S[1:360], TS4R54[1:360], PG54R[1:360]))

#train_df <- data.frame(cbind(PG2R, PG6R, PG9R, PG12R, PG54O,
#                          TT, TR54S5,TPG5S, TS4R54, PG54R))

print(length(train_df))
dimnames(train_df)[[2]] <- c('PG2R','PG6R','PG9R','PG12R','PG54O','TT','TR54S5','TPG5S', 'TS4R54','VGPR'  ) 
print(train_df)

[1] 10
        PG2R              PG6R         PG9R       PG12R PG54O    TT TR54S5
1         No Less than 2 years        4 - 6     55 - 64     4   270     48
2        Yes     13 - 19 years      11 - 15     45 - 54     3   149     30
3         No       2 - 5 years      11 - 15     35 - 44     9   132     13
4   Not sure       6 - 8 years        4 - 6     18 - 24     8   185      9
5         No       2 - 5 years        4 - 6     35 - 44     4   178     27
6        Yes       6 - 8 years        4 - 6     35 - 44     2   237     42
7         No       6 - 8 years        4 - 6     25 - 34     2   175     24
8        Yes Less than 2 years        2 - 3     18 - 24    13   374      7
9        Yes       2 - 5 years       7 - 10     35 - 44     8  1119     49
10        No       2 - 5 years More than 25     25 - 34    13   419      2
11  Not sure Less than 2 years        4 - 6     35 - 44     4   221     39
12  Not sure       2 - 5 years        4 - 6     25 - 34     3   290     72
13        No      

In [401]:
train_df$TT <- as.integer(train_df$TT)
train_df$TR54S5 <- as.integer(train_df$TR54S5)
train_df$TPG5S <- as.integer(train_df$TPG5S)
train_df$TS4R54 <-as.integer(train_df$TS4R54)


str(train_df)



'data.frame':	360 obs. of  10 variables:
 $ PG2R  : chr  "No" "Yes" "No" "Not sure" ...
 $ PG6R  : chr  "Less than 2 years" "13 - 19 years" "2 - 5 years" "6 - 8 years" ...
 $ PG9R  : chr  "4 - 6" "11 - 15" "11 - 15" "4 - 6" ...
 $ PG12R : chr  "55 - 64" "45 - 54" "35 - 44" "18 - 24" ...
 $ PG54O : chr  "4" "3" "9" "8" ...
 $ TT    : int  270 149 132 185 178 237 175 374 1119 419 ...
 $ TR54S5: int  48 30 13 9 27 42 24 7 49 2 ...
 $ TPG5S : int  81 53 55 36 68 62 61 90 115 120 ...
 $ TS4R54: int  33 23 42 27 41 20 37 83 66 118 ...
 $ VGPR  : chr  "High Priority" "Not a Priority" "Medium Priority" "Not a Priority" ...


In [458]:
#                          Test sets

# make a data frame of my cleaned data
test_df <- data.frame(cbind(PG2R[361:441], PG6R[361:441], PG9R[361:441], PG12R[361:441], PG54O[361:441],
                          TT[361:441], TR54S5[361:441],TPG5S[361:441], TS4R54[361:441], PG54R[361:441]))

#test_df <- data.frame(cbind(PG2R, PG6R, PG9R, PG12R, PG54O,
#                          TT, TR54S5,TPG5S, TS4R54, PG54R))

print(length(test_df))
dimnames(test_df)[[2]] <- c('PG2R','PG6R','PG9R','PG12R','PG54O','TT','TR54S5','TPG5S', 'TS4R54','VGPR'  ) 
print(test_df)
nrow(test_df)
ncol(test_df)

[1] 10
       PG2R              PG6R         PG9R   PG12R PG54O   TT TR54S5 TPG5S
1       Yes       2 - 5 years      11 - 15 25 - 34     5 5088    101   155
2  Not sure       2 - 5 years      11 - 15 45 - 54    13  212      7    71
3  Not sure       2 - 5 years        4 - 6 25 - 34     6  193     30    77
4        No Less than 2 years        2 - 3 25 - 34     8  826      0    33
5        No       2 - 5 years        4 - 6 25 - 34     3  302     65    93
6        No     13 - 19 years        2 - 3 35 - 44    12  162      2    47
7  Not sure       2 - 5 years      11 - 15 25 - 34    10  193     12    62
8       Yes       2 - 5 years        2 - 3 25 - 34    10  255     11    71
9        No       2 - 5 years        2 - 3 25 - 34     6  546    105   201
10       No Less than 2 years       7 - 10 25 - 34    11  175      5    72
11      Yes       2 - 5 years        4 - 6 18 - 24     4  190     21    46
12       No       2 - 5 years       7 - 10 25 - 34     2  200     46    76
13       No       

In [496]:
test_df$TT <- as.integer(test_df$TT)
test_df$TR54S5 <- as.integer(test_df$TR54S5)
test_df$TPG5S <- as.integer(test_df$TPG5S)
test_df$TS4R54 <-as.integer(test_df$TS4R54)
#test_df$VGPR <- as.factor(test_df$VGPR)

test_df$PG2R <- as.factor(test_df$PG2R)
test_df$PG6R <- as.factor(test_df$PG6R)
test_df$PG9R <- as.factor(test_df$PG9R)
test_df$PG12R <- as.factor(test_df$PG12R)
test_df$PG54O <- as.factor(test_df$PG54O)


str(test_df)

colnames(test_df)

'data.frame':	81 obs. of  10 variables:
 $ PG2R  : Factor w/ 3 levels "No","Not sure",..: 3 2 2 1 1 1 2 3 1 1 ...
 $ PG6R  : Factor w/ 6 levels "13 - 19 years",..: 2 2 2 6 2 1 2 2 2 6 ...
 $ PG9R  : Factor w/ 6 levels "11 - 15","16 - 25",..: 1 1 4 3 4 3 1 3 3 5 ...
 $ PG12R : Factor w/ 5 levels "18 - 24","25 - 34",..: 2 4 2 2 2 3 2 2 2 2 ...
 $ PG54O : Factor w/ 14 levels "10","11","12",..: 10 4 11 13 8 3 1 1 11 2 ...
 $ TT    : int  5088 212 193 826 302 162 193 255 546 175 ...
 $ TR54S5: int  101 7 30 0 65 2 12 11 105 5 ...
 $ TPG5S : int  155 71 77 33 93 47 62 71 201 72 ...
 $ TS4R54: int  54 64 47 33 28 45 50 60 96 67 ...
 $ VGPR  : chr  "Low Priority" "Medium Priority" "Low Priority" "Not a Priority" ...


In [560]:
# create a model based on my training data frame
options(digits = 9)

print('test data rows')
nrow(test_df)
print('Training data rows')
nrow(train_df)


nb_mod <- naiveBayes(VGPR ~ PG2R + PG6R + PG9R + PG12R + PG54O, data = train_df)
#nb_mod <- naiveBayes(VGPR ~ ., data = train_df)

nb_mod


[1] "test data rows"


[1] "Training data rows"



Naive Bayes Classifier for Discrete Predictors

Call:
naiveBayes.default(x = X, y = Y, laplace = laplace)

A-priori probabilities:
Y
      Essential   High Priority    Low Priority Medium Priority  Not a Priority 
   0.0472222222    0.2250000000    0.1750000000    0.3138888889    0.2388888889 

Conditional probabilities:
                 PG2R
Y                          No    Not sure         Yes
  Essential       0.352941176 0.235294118 0.411764706
  High Priority   0.506172840 0.234567901 0.259259259
  Low Priority    0.365079365 0.396825397 0.238095238
  Medium Priority 0.424778761 0.336283186 0.238938053
  Not a Priority  0.360465116 0.372093023 0.267441860

                 PG6R
Y                 13 - 19 years  2 - 5 years 20 years or more  6 - 8 years
  Essential        0.1176470588 0.4705882353     0.0000000000 0.1176470588
  High Priority    0.0493827160 0.4444444444     0.0493827160 0.1851851852
  Low Priority     0.0317460317 0.4444444444     0.0317460317 0.1111111111
  Mediu

In [562]:
#                             Prediction


x_data <- test_df[,1:9]
y_data <- test_df[, 10]
#x_data
prd <- predict(nb_mod, newdata = x_data, type = "raw")
#prd <- predict(nb_mod, newdata = test_df, type = "class")
#library(naivebayes)
#library(dplyr)
#library(ggplot2)
prd[1:10,]

res <- c('Essential', 'High Priority' , 'Low Priority',  'Medium Priority', 'Not a Priority')


p_l <-get_predictions(prd, res)

p_l
#ro <- prd[81,]

#print(ro)
#for (prob in prd){
#        for( i in prob){
#            print(i)
#        }
#}

#mx <- max(prd[81,])
#mx
#idx = match(mx,ro)

#print('The index')
#print(res[idx])
#summary(prd)

#test_df$VGPR

pridd_df = data.frame(cbind(p_l, y_data))
pridd_df
print('prediction number of rows')
nrow(prd)
print('prediction number of columns')
ncol(prd)

print('test number of rows')
nrow(test_df)
print('test number of cols')
ncol(test_df[,1:10])
nrow(y_data)
ncol(y_data)
length(y_data)
#length(prd)
nrow(p_l)
ncol(p_l)
length(p_l)
#length(test_df$VGPR)
#confusionMatrix(table(prd[1:10], test_df[,1:10]))
#confusionMatrix(table(p_l, y_data))


u <- union(p_l, y_data)
t <- table(factor(p_l, u), factor(y_data, u))
t
confusionMatrix(t)






Essential,High Priority,Low Priority,Medium Priority,Not a Priority
0.153274783161,0.1646053496,0.1584669902,0.295272473,0.2283804037
0.017521569228,0.0348799593,0.4023645759,0.290971984,0.2542619112
0.053122064688,0.1433771644,0.3254612319,0.34566728,0.1323722593
0.000369629902,0.0275782709,0.1675553886,0.458699865,0.3457968456
0.062683596898,0.5780448822,0.0441647796,0.279078889,0.0360278519
0.000723158809,0.1065944958,0.1350874049,0.118649129,0.6389458113
0.001289591017,0.0859918731,0.1524989018,0.359925992,0.400293642
0.001378150721,0.135427405,0.1552108704,0.343575927,0.3644076465
0.023496597655,0.1788154702,0.3219012606,0.330464002,0.1453226695
0.036346698869,0.0856168299,0.0597535781,0.51119212,0.3070907731


p_l,y_data
Medium Priority,Low Priority
Low Priority,Medium Priority
Medium Priority,Low Priority
Medium Priority,Not a Priority
High Priority,Medium Priority
Not a Priority,Low Priority
Not a Priority,Low Priority
Not a Priority,Medium Priority
Medium Priority,Medium Priority
Medium Priority,High Priority


[1] "prediction number of rows"


[1] "prediction number of columns"


[1] "test number of rows"


[1] "test number of cols"


NULL

NULL

NULL

NULL

                 
                  Medium Priority Low Priority High Priority Not a Priority
  Medium Priority               9            7             5              8
  Low Priority                  3            1             1              2
  High Priority                 8            5             5              2
  Not a Priority                9            5             3              6
  Essential                     0            0             0              0
                 
                  Essential
  Medium Priority         2
  Low Priority            0
  High Priority           0
  Not a Priority          0
  Essential               0

Confusion Matrix and Statistics

                 
                  Medium Priority Low Priority High Priority Not a Priority
  Medium Priority               9            7             5              8
  Low Priority                  3            1             1              2
  High Priority                 8            5             5              2
  Not a Priority                9            5             3              6
  Essential                     0            0             0              0
                 
                  Essential
  Medium Priority         2
  Low Priority            0
  High Priority           0
  Not a Priority          0
  Essential               0

Overall Statistics
                                              
               Accuracy : 0.259259            
                 95% CI : (0.168198, 0.368603)
    No Information Rate : 0.358025            
    P-Value [Acc > NIR] : 0.977822            
                                              
     

In [None]:


pairs.panels

In [98]:
# if you want to look at column namse

#print(col_names)
#print(resp_names)
#print(tResp5_4)

#print(ignore_l[1])

In [99]:
#print(data$)
#print(data$)
#print(data$PG9Resp)

In [34]:

#print(data[,'PG5_13Time'])