A Survey on Technology Choice
======


In [2]:
# Limit printed numbers to two significant digits for more compact output
options(digits = 2)

In [423]:
# Load the survey export. Text columns are read as factors because they are
# turned into integer level codes further down in the script.
data <- read.csv("TechSurvey - Survey.csv", header = TRUE, stringsAsFactors = TRUE)

# Treat missing answers as 0. Factor columns are skipped on purpose: 0 is not
# one of their levels, so assigning it would only raise an "invalid factor
# level" warning and leave the NA in place.
for (col in names(data)[!vapply(data, is.factor, logical(1))]) {
    data[[col]][is.na(data[[col]])] <- 0
}


# Turn the Start/End timestamps into Unix seconds so that later time
# arithmetic is plain numeric subtraction.
for (stamp_col in c("Start", "End")) {
    parsed <- strptime(data[[stamp_col]], "%Y-%m-%d %H:%M:%S")
    data[[stamp_col]] <- as.numeric(as.POSIXct(parsed))
}

# Total completion time per respondent, in seconds. Start and End hold Unix
# seconds at this point, so one vectorised subtraction replaces the row loop.
# A missing Start or End makes the difference NA; those respondents are
# dropped before averaging.
elapsed <- data[, "End"] - data[, "Start"]
time <- elapsed[!is.na(elapsed)]

print("Average time to complete survey in seconds")
print(mean(time))


# Names of the per-page submit timestamp columns, PG0Submit .. PG12Submit
submit_cols <- paste0("PG", 0:12, "Submit")

# Convert each submit timestamp to Unix seconds
for (col in submit_cols) {
    parsed <- strptime(data[[col]], "%Y-%m-%d %H:%M:%S")
    data[[col]] <- as.numeric(as.POSIXct(parsed))
}

# Replace each submit time with the seconds elapsed since the previous page
# (PG0 is measured from Start). Walk from the last page backwards so that the
# "previous" column still holds an absolute timestamp when it is subtracted.
previous_cols <- c("Start", submit_cols[-length(submit_cols)])
for (k in rev(seq_along(submit_cols))) {
    data[[submit_cols[k]]] <- data[[submit_cols[k]]] - data[[previous_cols[k]]]
}


# Per-page answer times (seconds), one vector per page in order PG0..PG12
submit_cols <- paste0("PG", 0:12, "Submit")
submits <- lapply(submit_cols, function(col) data[, col])

# Keep the individual vectors available under their original names t0..t12
for (k in seq_along(submits)) {
    assign(paste0("t", k - 1), submits[[k]])
}

# Mean time per page, ignoring missing values; position k holds page k - 1
means <- vapply(submits, mean, numeric(1), na.rm = TRUE)

# Look up which page is slowest/fastest instead of hard-coding the page label,
# so the printed label always agrees with the printed value.
slowest_page <- which.max(means) - 1
fastest_page <- which.min(means) - 1

print('---------------------')
print(paste0("PG", slowest_page, " took the longest time to answer (sec)"))
print(max(means))

print('---------------------')
print(paste0("PG", fastest_page, " took the shortest time to answer (sec)"))
print(min(means))

# Remove every timing column from the data set: any column whose name contains
# "Submit" or "Time", plus the overall Start/End stamps.
# The pattern is a plain substring alternation. The earlier '*Submit' / '*Time'
# were shell-style globs, and a leading '*' is not a well-defined regular
# expression.
is_timing <- grepl("Submit|Time", colnames(data))
tlist <- c(colnames(data)[is_timing], "Start", "End")

# drop = FALSE keeps a data frame even if only one column survives
data <- data[, !names(data) %in% tlist, drop = FALSE]

# Encode every remaining column as a number. Factor columns become their
# integer level codes; columns that already hold numbers keep their values.
data[] <- lapply(data, as.numeric)

# Count how often each encoded ranking value occurs for every PG5 criterion,
# to see which criterion is ranked highest (one of the simple questions).
criteria <- c(
    'PG5_1RRPQ', 'PG5_2BNUI', 'PG5_3HDS', 'PG5_4VGP', 'PG5_5PHR',
    'PG5_6SSYOP', 'PG5_7NDYP', 'PG5_8CP', 'PG5_9FRP', 'PG5_10RPA',
    'PG5_11NSG', 'PG5_12NWG', 'PG5_13NFG'
)
rank_counts <- lapply(criteria, function(col) table(data[, col]))

# Expose the tables as c1..c13 and print each one under its column name
for (k in seq_along(criteria)) {
    assign(paste0("c", k), rank_counts[[k]])
    print(criteria[k])
    print(rank_counts[[k]])
}

# Positions of the numeric columns; only these can be correlated
sel <- unname(which(vapply(data, is.numeric, logical(1))))

# Spearman (rank-based) correlation of each numeric column with PG5_4VGP
cor(data[, sel], data[, "PG5_4VGP"], method = "spearman", use = "pairwise.complete.obs")
# Full pairwise matrix, kept for reference:
# cor(data[, sel], method = "spearman", use = "pairwise.complete.obs")


# Split into a training set (first 80% of rows) and a test set (the rest).
# The cut-off is derived from the row count instead of hard-coded row numbers,
# so every row is used exactly once whatever the size of the survey export.
n_train <- floor(0.8 * nrow(data))
in_train <- seq_len(nrow(data)) <= n_train
train1 <- data[in_train, , drop = FALSE]
test <- data[!in_train, , drop = FALSE]

# Target column for the train and test rows, kept as one-column data frames
growth <- train1["PG5_4VGP"]
tgrowth <- test["PG5_4VGP"]

# Columns left out of both models
excluded <- c("Device", "PG0Dis", "PG0Shown", "PG1PsnUse", "PG4AllResp",
              "PG1WdAuth", "PG7C.C..")

# Training frame for lm(): remaining predictors plus the target column
formodel <- train1[, !names(train1) %in% excluded, drop = FALSE]

# Predictor-only frames for the normal-equation fit (target removed as well)
remove <- c("PG5_4VGP", excluded)
train <- train1[, !names(train1) %in% remove, drop = FALSE]
test <- test[, !names(test) %in% remove, drop = FALSE]

trainmatrix <- as.matrix(train)
testmatrix <- as.matrix(test)

library(MASS)  # provides ginv(), the generalized (pseudo) inverse

# Least-squares coefficients via the normal equation: B = pinv(X'X) X' Y
# NOTE(review): X has no column of ones, so this fit has no intercept term,
# unlike the lm() model fitted later; x0 is allocated but never used. Confirm
# whether an intercept column was intended.
m <- nrow(train)
x0 <- matrix(0, m, 1)
XT <- t(trainmatrix)
X <- trainmatrix
Y <- as.matrix(growth)
XTX <- XT %*% X
inv <- ginv(XTX, tol = sqrt(.Machine$double.eps))
newB <- (inv %*% XT) %*% Y

TestX <- testmatrix
TestY <- as.matrix(tgrowth)

# Predicted target values for the test rows
Y_pred <- TestX %*% newB

# Turn the continuous predictions into whole-number codes by truncating toward
# zero (not rounding). Assigning through [] keeps the matrix shape.
Y_pred[] <- as.integer(Y_pred)

# Share of test rows whose truncated prediction equals the observed code.
# A missing prediction or observation counts as a miss instead of aborting.
correct <- sum(Y_pred == TestY, na.rm = TRUE)

acc <- (correct / length(Y_pred)) * 100
print('---------------------')
print("Accuracy of model is")
print(acc)

#library(ggplot2)
#data$PG11Resp <- as.factor(data$PG11Resp)
#ggplot(data, aes(x = PG12Resp)) + geom_histogram() + facet_wrap(~PG11Resp)


# Fit R's built-in linear model: PG5_4VGP regressed on every other column of
# formodel, then show the coefficient table and fit statistics.
model <- lm(formula = PG5_4VGP ~ ., data = formodel)
summary(model)


#str(data)



[1] "Average time to complete survey in seconds"
[1] 680.3565
[1] "---------------------"
[1] "PG0 took the longest time to answer (sec)"
[1] 299.3466
[1] "---------------------"
[1] "PG11 took the shortest time to answer (sec)"
[1] 6.368944
[1] "PG5_1RRPQ"

  1   2   3   4   5   6 
877  60 102  85 134  95 
[1] "PG5_2BNUI"

  1   2   3   4   5   6 
923   3  26 121  92 188 
[1] "PG5_3HDS"

  1   2   3   4   5   6 
768 103 200  69 162  51 
[1] "PG5_4VGP"

  1   2   3   4   5   6 
852  22 111  88 164 116 
[1] "PG5_5PHR"

  1   2   3   4   5   6 
753  79 252  63 162  44 
[1] "PG5_6SSYOP"

  1   2   3   4   5   6 
852  63 137  84 110 107 
[1] "PG5_7NDYP"

  1   2   3   4   5   6 
934   8  31  93  52 235 
[1] "PG5_8CP"

  1   2   3   4   5   6 
715 232 197  52 121  36 
[1] "PG5_9FRP"

  1   2   3   4   5   6 
738 165 243  42 125  40 
[1] "PG5_10RPA"

  1   2   3   4   5   6 
779  55 204  79 151  85 
[1] "PG5_11NSG"

  1   2   3   4   5   6 
890   6  29  89  68 271 
[1] "PG5_12NWG"

  1   2  

0,1
Device,-0.0741707134
Completed,0.5905347582
PG0Dis,0.0008795874
PG0Shown,-0.0009939303
PG1PsnUse,0.2344926007
PG1WdAuth,0.1331765808
PG1Trn,0.1062784123
PG1Other,0.0240152442
PG2Resp,0.3691662793
PG2Resp.1,0.4307481087


[1] "---------------------"
[1] "Accuracy of model is"
[1] 70.1107



Call:
lm(formula = PG5_4VGP ~ ., data = formodel)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.6294 -0.3450 -0.0326  0.1026  3.7305 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)   -1.0083480  0.8152896  -1.237 0.216443    
Completed      0.3396497  0.2395634   1.418 0.156554    
PG1Trn         0.0615521  0.0842741   0.730 0.465323    
PG1Other      -0.0098307  0.0036208  -2.715 0.006737 ** 
PG2Resp       -0.0013720  0.0330290  -0.042 0.966874    
PG2Resp.1     -0.0005726  0.0005604  -1.022 0.307146    
PG4Dtr0_6      0.0067233  0.0280819   0.239 0.810828    
PG4Psv7_8      0.0019389  0.0153969   0.126 0.899811    
PG4Prm9_10    -0.0049726  0.0112601  -0.442 0.658861    
PG5_1RRPQ     -0.0627224  0.0341669  -1.836 0.066679 .  
PG5_1Order     0.0146009  0.0127994   1.141 0.254239    
PG5_2BNUI      0.0507324  0.0378838   1.339 0.180813    
PG5_2Order     0.0084029  0.0115879   0.725 0.468529    
PG5_3HDS       0.0299083  0.0282621   1.05

### Explanation and Thoughts

The first step in analyzing this data was to import the csv file and read it into a data frame. Before any data modeling can happen, the data has to be cleaned up and free of abnormalities. The first step in cleaning was to convert all datetime variables into Unix seconds so calculations would be easier. All the columns such as start, end, *submit, and *time were converted to seconds. Then, to analyze the data further, the time difference was taken between each question submit. This allows us to see which question took the longest and which took the shortest to answer across the entire survey. I did this by calculating the mean of each question submit column while ignoring the NA values and concatenating this value to a list. At the end of the iteration, 13 values would be in this list, and I found the maximum and minimum values from this list, representing the longest and shortest questions. The question that took the longest was PG0 and the question that took the shortest amount of time was PG11. After using the time data to answer these questions, it was time to remove these columns from the data. I removed these because time data would not help the model, as time is a unique and independent characteristic of each sample and does not have any correlation with the actual data. The time values would also skew the results because of their large magnitude in Unix seconds compared to the other, much smaller values. After removing all the columns that were related to time, I was finally able to encode each of the unique values in each column as a numerical attribute. Text attributes are harder to use in a linear regression model, which is why I had to map each value to its own numerical value to give it a weight. After that I analyzed each column and got the count of each unique value in each of the response variable columns, so I could see which criterion was ranked highest.
After using the table() command on each of the 13 response variable columns, I found that PG5_8CP (Computing Performances)  had the highest number of "Essential" rankings compared to the others with a count of 232 responses. My encoding for the category "Essential" was 2.

Now it was time to begin modeling the data. I used multiple linear regression for this model since all my attributes were numerical. Since this would be a prediction problem, I split my data into training and testing sets 80/20. Then I extracted the target variable column values for later use with predictions and testing. Next I had to remove the target column from the training and testing sets so my model would work only with the independent variables. I created a model in two ways: by using the normal equation and by using the lm() function in R. The normal equation is (X^T * X)^-1 * (X^T * Y). By using this normal matrix equation, I was able to get the coefficients for my linear regression. For X, I used my original training set in matrix form and for Y, I used the target column that I extracted earlier. Once I got my coefficients, I was able to take the matrix product of my test set and my coefficients to get a set of predicted values. Using this prediction set of my target variable, I compared the values and was able to calculate the accuracy. The accuracy of my model was about 70%. I think that this is good, especially for forcing this data to conform to a linear regression model.

The second way I modeled the data was by using the built-in lm() function in R. This linear model function is used to simply fit the data provided to a linear model. I passed in the target variable first and then the rest of the data into the function. Looking at the model by calling the summary() function on it, I got different metrics. First of all, I get the residuals. The residuals are the differences between the actual observed response values and the predicted response values from the model. It is best to have a symmetrical distribution across the 5 summary points presented: Min, 1Q, Median, 3Q, Max. Based on my results, I believe I have a fairly symmetrical distribution that supports the model. The next important metric to look at is the R^2 value. This value shows how well the model actually fits the data. The closer the value is to 1, the better. In this case, the R^2 value was about 0.78, which is pretty good. In conclusion, we can say that our multiple linear regression model fit the data pretty well. I think the priority is going to be affected mainly by all other variables except the timing variables and the variables: 'Device', 'PG0Dis', 'PG0Shown', 'PG1PsnUse', 'PG4AllResp', 'PG1WdAuth','PG7C.C..', 'PG4AllResp'. I had to play around and remove some columns one by one to get the best accuracy. The variables that were used in this model are identified above from running the lm() function. The variables I chose to discard from the model are: 'Device', 'PG0Dis', 'PG0Shown', 'PG1PsnUse', 'PG4AllResp', 'PG1WdAuth','PG7C.C..', 'PG4AllResp'. I chose to remove these variables because, in the correlation table against my response variable, PG5_4VGP, their values were very low. Also, some of these columns had many different unique values that mapped to several numeric values. I thought they would skew the model by having numeric weights that were too large, so I decided to leave them out.

Interpret correlations: only Start vs End, calculate difference instead


### Simple questions

- Time to take entire survey?
- Question that took the longest to complete?
- Question that took the least time?
- Top-ranked criteria?
- Demographic distribution by age?