-
Notifications
You must be signed in to change notification settings - Fork 0
/
AutoData_Code.R
513 lines (405 loc) · 16 KB
/
AutoData_Code.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
# Load the cleaned Auto data set; "?" entries in the CSV are read as NA.
# NOTE(review): the absolute Windows path below only resolves on the original
# author's machine -- adjust it before running the script elsewhere.
autodata <- read.csv("C:\\Users\\deepp\\Google Drive\\cleanDataAuto.csv",
                     header = TRUE, na.strings = "?")
# Fail fast if a column the rest of the script reads by name is missing.
stopifnot(all(c("mpg", "cylinders", "displacement", "horsepower", "weight",
                "acceleration") %in% names(autodata)))
names(autodata)
head(autodata)
dim(autodata)
autodata <- na.omit(autodata) # drops every row that contains a missing value
dim(autodata)  # rows & columns left after the incomplete rows are dropped
nrow(autodata) # N = number of complete cases
## mpg = miles per gallon is the target variable.
## The 5 explanatory variables (features) F1..F5 are read by column name:
## F1 = cyl, F2 = dis, F3 = hor, F4 = wei, F5 = acc.
## Each vector is bound to two names (Fk and a short mnemonic); later sections
## use both spellings.
Y <- mpg <- autodata$mpg # Target/Response variable
F1 <- cyl <- autodata$cylinders
F2 <- dis <- autodata$displacement
F3 <- hor <- autodata$horsepower
F4 <- wei <- autodata$weight
F5 <- acc <- autodata$acceleration
#_------------------------------------------------------------------------------------
###### Part 1 mean & sd ######
# For every feature: sample mean first, then sample standard deviation.
mean(x = F1); sd(x = F1) # cylinders
mean(x = F2); sd(x = F2) # displacement
mean(x = F3); sd(x = F3) # horsepower
mean(x = F4); sd(x = F4) # weight
mean(x = F5); sd(x = F5) # acceleration
#-------------------------------------------------------------------------------------------------
###### Part 2 Histograms #######
# Reset the graphics state before laying out the 3 x 2 panel. A bare dev.off()
# raises "cannot shut down device 1 (the null device)" when no plot device is
# open (e.g. in a fresh session), so only close a device if one is active.
if (dev.cur() > 1L) dev.off()
par(mfrow = c(3, 2)) # 3 rows x 2 columns: five features + the mpg target
hist(F1, breaks = 10,
     main = "Histogram of cylinders",
     xlab = "cyl (F1)",
     col = "light blue") # histogram of cylinders
hist(F2, breaks = 10,
     main = "Histogram of displacement",
     xlab = "dis (F2)",
     col = "orange") # histogram of displacement
hist(F3, breaks = 10,
     main = "Histogram of horsepower",
     xlab = "hor (F3)",
     col = "light yellow") # histogram of horsepower
hist(F4, breaks = 10,
     main = "Histogram of weight",
     xlab = "wei (F4)",
     col = "purple") # histogram of weight
hist(F5, breaks = 10,
     main = "Histogram of acceleration",
     xlab = "acc (F5)",
     col = "light pink") # histogram of acceleration
hist(mpg, breaks = 10,
     main = "Miles per gallon (mpg)",
     xlab = "mpg",
     col = "light green") # histogram of miles per gallon
# NOTE: reprint hist F4 & F5 - error in labels
# Probability density function - unfinished attempt to draw a normal curve
# over the histogram (refer to pnormplot in 6358 Quiz 1?)
#x1<- dnorm(F1, mean = 5.47, sd=1.706)
#plot(F1,x1)
#lines(F1,x1)
#-----------------------------------------------------------------------------------------
######## Part #3 Scatterplots ########
# One scatterplot of the target (mpg, y axis) against each feature (x axis).
# Each plot gets its own marker shape (pch), marker scale (cex) and colour.

# mpg vs. cylinder: blue plus signs at 100% size
plot(x = cyl, y = mpg,
     main = "mpg vs. cylinder",
     xlab = "cylinders (cyl)",
     ylab = "miles per gallon (mpg)",
     col = "blue", pch = 3, cex = 1)

# mpg vs. displacement: orange triangles at 100% size
plot(x = dis, y = mpg,
     main = "mpg vs. displacement",
     xlab = "displacement (dis)",
     ylab = "miles per gallon (mpg)",
     col = "orange", pch = 2, cex = 1)

# mpg vs. horsepower: golden-yellow circles at 120% size
plot(x = hor, y = mpg,
     main = "mpg vs. horsepower",
     xlab = "horsepower (hor)",
     ylab = "miles per gallon (mpg)",
     col = "gold1", pch = 1, cex = 1.2)

# mpg vs. weight: purple x marks at 150% size
plot(x = wei, y = mpg,
     main = "mpg vs. weight",
     xlab = "weight (wei)",
     ylab = "miles per gallon (mpg)",
     col = "purple", pch = 4, cex = 1.5)

# mpg vs. acceleration: pink diamonds at 150% size
plot(x = acc, y = mpg,
     main = "mpg vs. acceleration",
     xlab = "acceleration (acc)",
     ylab = "miles per gallon (mpg)",
     col = "deeppink3", pch = 5, cex = 1.5)
#---------------------------------------------------------------------------------------------
#### Part #4 Correlations ####
# Correlation (cor() default method) of each feature with the mpg target.
cor(x = cyl, y = mpg) # cylinders
cor(x = dis, y = mpg) # displacement
cor(x = hor, y = mpg) # horsepower
cor(x = wei, y = mpg) # weight
cor(x = acc, y = mpg) # acceleration
#-------------------------------------------------------------------------------------------------
### Part #5 Correlation Matrix
# Install corrplot only when it is missing, instead of re-downloading and
# re-installing it on every run of the script.
if (!requireNamespace("corrplot", quietly = TRUE)) {
  install.packages("corrplot")
}
# NOTE(review): the next line downloads and executes remote R code over plain
# HTTP on every run. Presumably this script is what defines rquery.cormat()
# (it is not defined anywhere in this file); consider vendoring a reviewed
# local copy of it instead.
source("http://www.sthda.com/upload/rquery_cormat.r")
library(corrplot)
head(autodata)
# Keep only the five explanatory variables; the mpg target is left out.
features <- autodata[, c("cylinders", "displacement", "horsepower", "weight",
                         "acceleration")]
head(features)
rquery.cormat(features, type = "full")
#-----------------------------------------------------------------------------------------
###### Part #6: Quantile plot #########
# Quantiles of mpg at 1%, 2%, ..., 100%.
percentile <- seq(from = 0.01, to = 1, by = 0.01)
qmpg <- quantile(mpg, probs = percentile)
qmpg
# Empirical quantile curve: the sorted mpg values plotted against the evenly
# spaced fractions 0, 1/(n-1), ..., 1.
Y <- mpg
n <- length(Y)
plot(x = (0:(n - 1)) / (n - 1),
     y = sort(Y),
     type = "l",
     col = "red",
     main = "Quantile curve for miles per gallon (mpg)",
     xlab = "Percentile x%",
     ylab = "quantile value for mpg")
#-----------------------------------------------------------------------------------------
##### Part 7: Extracting Data ####
# Recompute the 1%..100% quantiles of mpg and store them as a one-column
# data frame (one row per percentile).
percentile <- seq(from = 0.01, to = 1, by = 0.01)
qmpg <- quantile(mpg, probs = percentile)
qmpg.data <- data.frame(qmpg)
head(qmpg.data)
dim(qmpg.data)
# Row k holds the k% quantile because the probabilities run 1%, 2%, ..., 100%.
q33 <- qmpg.data[33, 1] # value at the 33% percentile
q66 <- qmpg.data[66, 1] # value at the 66% percentile
# LOWmpg: cases with mpg at or below the 33% quantile.
filter.q33 <- autodata[["mpg"]] <= q33
filter.q33
LOWmpg <- autodata[filter.q33, ]
head(LOWmpg)
# HIGHmpg: cases with mpg strictly above the 66% quantile.
filter.q66 <- autodata[["mpg"]] > q66
HIGHmpg <- autodata[filter.q66, ]
head(HIGHmpg)
#--------------------------------------------------------------------------------------
#### Part 8: Histogram of LOWmpg & HIGHmpg ##########
# Target and features restricted to the LOWmpg cases. Each vector is bound to
# two names (FkLow-style and a short mnemonic).
Y.low <- LOWmpg$mpg
F1low <- cyl.low <- LOWmpg$cylinders
F2low <- dis.low <- LOWmpg$displacement
F3low <- hor.low <- LOWmpg$horsepower
F4low <- wei.low <- LOWmpg$weight
F5low <- acc.low <- LOWmpg$acceleration
# Same for the HIGHmpg cases.
Y.high <- HIGHmpg$mpg
F1high <- cyl.high <- HIGHmpg$cylinders
F2high <- dis.high <- HIGHmpg$displacement
F3high <- hor.high <- HIGHmpg$horsepower
F4high <- wei.high <- HIGHmpg$weight
F5high <- acc.high <- HIGHmpg$acceleration
# Reset the graphics state. A bare dev.off() errors with "cannot shut down
# device 1 (the null device)" when this section is run without an open plot
# device, so only close a device if one is active.
if (dev.cur() > 1L) dev.off()
# Put graphs in 5 rows and 2 columns: LOWmpg on the left, HIGHmpg on the right
par(mfrow = c(5, 2))
hist(F1low, breaks = 10,
     main = "Histogram of cylinders with LOWmpg cases",
     xlab = "cyl",
     col = "light blue") # LOWmpg cases histogram of cylinders
hist(F1high, breaks = 10,
     main = "Histogram of cylinders with HIGHmpg cases",
     xlab = "cyl",
     col = "light blue") # HIGHmpg cases histogram of cylinders
hist(F2low, breaks = 10,
     main = "Histogram of displacement with LOWmpg cases",
     xlab = "dis",
     col = "orange") # LOWmpg cases histogram of displacement
hist(F2high, breaks = 10,
     main = "Histogram of displacement with HIGHmpg cases",
     xlab = "dis",
     col = "orange") # HIGHmpg cases histogram of displacement
hist(F3low, breaks = 10,
     main = "Histogram of horsepower with LOWmpg cases",
     xlab = "hor",
     col = "light yellow") # LOWmpg cases histogram of horsepower
hist(F3high, breaks = 10,
     main = "Histogram of horsepower with HIGHmpg cases",
     xlab = "hor",
     col = "light yellow") # HIGHmpg cases histogram of horsepower
hist(F4low, breaks = 10,
     main = "Histogram of weight with LOWmpg cases",
     xlab = "wei",
     col = "purple") # LOWmpg cases histogram of weight
hist(F4high, breaks = 10,
     main = "Histogram of weight with HIGHmpg cases",
     xlab = "wei",
     col = "purple") # HIGHmpg cases histogram of weight
hist(F5low, breaks = 10,
     main = "Histogram of acceleration with LOWmpg cases",
     xlab = "acc",
     col = "light pink") # LOWmpg cases histogram of acceleration
hist(F5high, breaks = 10,
     main = "Histogram of acceleration with HIGHmpg cases",
     xlab = "acc",
     col = "light pink") # HIGHmpg cases histogram of acceleration
#---------------------------------------------------------------------------------------
##### Part 9: mean mL, mH & standard dev stdL, stdH for LOW & HIGH mpg ####
# Naming: mL*/stdL* are computed on the LOWmpg cases, mH*/stdH* on the HIGHmpg
# cases; the trailing digit is the feature number. Every value is stored and
# then printed.

# F1 = cylinders
mLF1 <- mean(x = F1low)
mLF1
stdL1 <- sd(x = F1low)
stdL1
mHF1 <- mean(x = F1high)
mHF1
stdH1 <- sd(x = F1high)
stdH1

# F2 = displacement
mLF2 <- mean(x = F2low)
mLF2
stdL2 <- sd(x = F2low)
stdL2
mHF2 <- mean(x = F2high)
mHF2
stdH2 <- sd(x = F2high)
stdH2

# F3 = horsepower
mLF3 <- mean(x = F3low)
mLF3
stdL3 <- sd(x = F3low)
stdL3
mHF3 <- mean(x = F3high)
mHF3
stdH3 <- sd(x = F3high)
stdH3

# F4 = weight
mLF4 <- mean(x = F4low)
mLF4
stdL4 <- sd(x = F4low)
stdL4
mHF4 <- mean(x = F4high)
mHF4
stdH4 <- sd(x = F4high)
stdH4

# F5 = acceleration
mLF5 <- mean(x = F5low)
mLF5
stdL5 <- sd(x = F5low)
stdL5
mHF5 <- mean(x = F5high)
mHF5
stdH5 <- sd(x = F5high)
stdH5
#-------------------------------------------------------------------------------------
# for quick calculations of mean & sd
# Install psych only when it is missing, instead of re-installing it on every
# run of the script.
if (!requireNamespace("psych", quietly = TRUE)) {
  install.packages("psych")
}
library(psych)
describe(LOWmpg)
describe(HIGHmpg)
#--------------------------------------------------------------------------------------
NL <- nrow(LOWmpg); NL  # number of cases in LOWmpg
NH <- nrow(HIGHmpg); NH # number of cases in HIGHmpg
M <- (NL + NH) / 2; M   # M = mean of the two group sizes NL and NH
# For each feature F compute s(F) = sqrt((stdL^2 + stdH^2) / M)
####### Part 10: Using above Formula ############
sF1 <- sqrt((stdL1^2 + stdH1^2) / M); sF1
sF2 <- sqrt((stdL2^2 + stdH2^2) / M); sF2
sF3 <- sqrt((stdL3^2 + stdH3^2) / M); sF3
sF4 <- sqrt((stdL4^2 + stdH4^2) / M); sF4
sF5 <- sqrt((stdL5^2 + stdH5^2) / M); sF5
# Rough evaluation of the discriminating power of each feature F to
# distinguish between LOWmpg and HIGHmpg cases: discr(F) = |mH - mL| / s(F)
discr.F1 <- abs(mHF1 - mLF1) / sF1 # cylinders
discr.F1
discr.F2 <- abs(mHF2 - mLF2) / sF2 # displacement
discr.F2
discr.F3 <- abs(mHF3 - mLF3) / sF3 # horsepower
discr.F3
discr.F4 <- abs(mHF4 - mLF4) / sF4 # weight
discr.F4
discr.F5 <- abs(mHF5 - mLF5) / sF5 # acceleration
discr.F5
#-------------------------------------------------------------------------------------------------
###### Part 11 threshold calculations ######
# thr(F) = (mL * stdH + mH * stdL) / (stdH + stdL): a weighted average of the
# two group means, with each mean weighted by the OTHER group's sd.
thr.F1 <- (mLF1 * stdH1 + mHF1 * stdL1) / (stdH1 + stdL1) # threshold of cylinders
thr.F1
thr.F2 <- (mLF2 * stdH2 + mHF2 * stdL2) / (stdH2 + stdL2) # threshold of displacement
thr.F2
thr.F3 <- (mLF3 * stdH3 + mHF3 * stdL3) / (stdH3 + stdL3) # threshold of horsepower
thr.F3
thr.F4 <- (mLF4 * stdH4 + mHF4 * stdL4) / (stdH4 + stdL4) # threshold of weight
thr.F4
thr.F5 <- (mLF5 * stdH5 + mHF5 * stdL5) / (stdH5 + stdL5) # threshold of acceleration
thr.F5
# condition 1a: mH > mL -> score(F) = 1 when F > thr, -1 when F <= thr
condition1a <- c(mHF1 > mLF1, mHF2 > mLF2, mHF3 > mLF3, mHF4 > mLF4, mHF5 > mLF5)
condition1a
# condition 1b: mH < mL -> score(F) = 1 when F < thr, -1 when F >= thr
condition1b <- c(mHF1 < mLF1, mHF2 < mLF2, mHF3 < mLF3, mHF4 < mLF4, mHF5 < mLF5)
condition1b

# Score one feature against its threshold following conditions 1a/1b above.
# The comparison direction is derived from the group means instead of being
# hard-coded per feature, so the scores stay correct if the data change.
#   x      numeric vector of feature values
#   thr    threshold for the feature
#   m_low  mean of the feature over the LOWmpg cases
#   m_high mean of the feature over the HIGHmpg cases
# Returns a vector of +1 / -1 (NA where x is NA). When m_high is not greater
# than m_low (including a tie or NA) the condition-1b direction is used.
score_feature <- function(x, thr, m_low, m_high) {
  votes_high <- if (isTRUE(m_high > m_low)) x > thr else x < thr
  ifelse(votes_high, 1, -1)
}

f5score <- score_feature(autodata$acceleration, thr.F5, mLF5, mHF5)
f5score
f1score <- score_feature(autodata$cylinders, thr.F1, mLF1, mHF1)
f1score
f2score <- score_feature(autodata$displacement, thr.F2, mLF2, mHF2)
f2score
f3score <- score_feature(autodata$horsepower, thr.F3, mLF3, mHF3)
f3score
f4score <- score_feature(autodata$weight, thr.F4, mLF4, mHF4)
f4score
#-------------------------------------------------------------------------------------
# Part 12
# Full score of a case = sum of its five per-feature scores.
fullscore <- f1score + f2score + f3score + f4score + f5score
fullscore
# Working copy of the data with the fullscore column appended.
autodata2 <- autodata
head(autodata2)
autodata2[["fullscore"]] <- fullscore
# Backup copy: at this point identical to autodata plus the fullscore column.
autodata3 <- autodata2
library(dplyr) # to use coalesce function
# True class: HIGHmpg when mpg >= median(mpg), LOWmpg otherwise.
mpg.median <- median(autodata2[["mpg"]])
mpg.median
filter.true_high <- ifelse(autodata2[["mpg"]] >= mpg.median, "HIGHmpg", NA)
filter.true_low <- ifelse(autodata2[["mpg"]] < mpg.median, "LOWmpg", NA)
# The two half-filled label vectors are parked in scratch columns, merged into
# class.true, and the scratch columns are dropped again further down.
autodata2[["scorehigh_true"]] <- filter.true_high
autodata2[["scorelow_true"]] <- filter.true_low
head(autodata2)
autodata2[["class.true"]] <- coalesce(filter.true_high, filter.true_low)
head(autodata2, n = 25)
head(autodata2, n = 25)
autodata2$scorehigh_true <- NULL
autodata2$scorelow_true <- NULL
autodata2$class.pred <- NULL # no such column has been created above: no effect
head(autodata2, n = 25)
qmpg.data
# TRAIN / TEST split driven by the mpg quantiles
qmpg.data <- data.frame(qmpg)
qmpg.data
q33 <- qmpg.data[33, 1] # 33% quantile of mpg
q66 <- qmpg.data[66, 1] # 66% quantile of mpg
autodata3 <- autodata2
# Label the two tails: HIGHmpg above q66, LOWmpg at or below q33. Cases in
# between match neither filter and stay NA.
filter.q66_high <- ifelse(autodata3[["mpg"]] > q66, "HIGHmpg", NA)
filter.q33_low <- ifelse(autodata3[["mpg"]] <= q33, "LOWmpg", NA)
autodata3[["class.Pred_Train"]] <- coalesce(filter.q66_high, filter.q33_low)
head(autodata3, n = 25)
# TRAIN = labelled tail cases (<= q33 or > q66); TEST = unlabelled middle
# cases. The helper column class.Pred_Train is dropped from both.
autodata4_Train <- subset(autodata3, !is.na(class.Pred_Train),
                          select = -class.Pred_Train)
autodata5_Test <- subset(autodata3, is.na(class.Pred_Train),
                         select = -class.Pred_Train)
head(autodata4_Train, n = 25)
head(autodata5_Test)
# TRAIN DATA SET CLASSIFIER
# Clear any class.Pred_A column left over from an earlier run of this section.
autodata4_Train$class.Pred_A <- NULL
A <- 0 # cut-off on fullscore: >= A predicts HIGHmpg, < A predicts LOWmpg
filter.high <- ifelse(autodata4_Train[["fullscore"]] >= A, "HIGHmpg", NA)
filter.low <- ifelse(autodata4_Train[["fullscore"]] < A, "LOWmpg", NA)
library(dplyr)
# Merge the two half-filled label vectors into the predicted class.
autodata4_Train[["class.Pred_A"]] <- coalesce(filter.high, filter.low)
head(autodata4_Train)
# TEST DATA SET CLASSIFIER
# Clear any earlier prediction first (execute before changing the A value).
autodata5_Test$class.Pred_A <- NULL
A <- 2 # cut-off on fullscore: >= A predicts HIGHmpg, < A predicts LOWmpg
filter.high <- ifelse(autodata5_Test[["fullscore"]] >= A, "HIGHmpg", NA)
filter.low <- ifelse(autodata5_Test[["fullscore"]] < A, "LOWmpg", NA)
# Merge the two half-filled label vectors into the predicted class.
autodata5_Test[["class.Pred_A"]] <- coalesce(filter.high, filter.low)
head(autodata5_Test)
#----------------------------------------
#Part 13= confusion matrix
library(caret)
# The class labels are character strings, so they cannot be divided by nrow()
# inside table() ("non-numeric argument to binary operator"). Tabulate the
# labels first and normalise the resulting counts with prop.table() instead.
# Fixing the factor levels keeps every matrix 2 x 2 (LOW first, then HIGH)
# even when one class never occurs in a column.

#Confusion matrix for TRAIN Data set: counts, then % of all TRAIN cases
confusion.matrix_TRAIN <- with(
  autodata4_Train,
  table(
    "True" = factor(class.true, levels = c("LOWmpg", "HIGHmpg")),
    "Prediction" = factor(class.Pred_A, levels = c("LOWmpg", "HIGHmpg"))
  )
)
confusion.matrix_TRAIN
prop.table(confusion.matrix_TRAIN) * 100

#Confusion matrix for TEST Data set: counts, then % of all TEST cases
confusion.matrix_TEST <- with(
  autodata5_Test,
  table(
    "True" = factor(class.true, levels = c("LOWmpg", "HIGHmpg")),
    "Prediction" = factor(class.Pred_A, levels = c("LOWmpg", "HIGHmpg"))
  )
)
confusion.matrix_TEST
prop.table(confusion.matrix_TEST) * 100
#-------------------------------------------------------------------------------------------------------------------------------
# #handwritten code for confusion matrix
# qll<-0
# qlh<-0
# qhl<-0
# qhh<-0
#
# for (i in 1:length(autodata4_Train$mpg)){
# if ((autodata4_Train$class.true[i]=="LOWmpg") & (autodata4_Train$class.Pred_A[i]=="LOWmpg")){
# qll<-qll+1
# }
# if ((autodata4_Train$class.true[i]=="LOWmpg") & (autodata4_Train$class.Pred_A[i]=="HIGHmpg")){
# qlh<-qlh+1
# }
# if ((autodata4_Train$class.true[i]=="HIGHmpg") & (autodata4_Train$class.Pred_A[i]=="LOWmpg")){
# qhl<-qhl+1
# }
# if ((autodata4_Train$class.true[i]=="HIGHmpg") & (autodata4_Train$class.Pred_A[i]=="HIGHmpg")){
# qhh<- qhh+1
# }
# }
#
# confusion.matrixA1<- matrix(c(qll,qhl,qlh,qhh),nrow=2,ncol =2,
# dimnames = list(c("True LOW", "True HIGH"), c("Pred LOW", "Pred HIGH")))
# confusion.matrixA1
# sum(confusion.matrixA1)