# **PLS (Partial Least Squares)를 이용한 Odor 분석**

---

In [1]:
import rpy2
%load_ext rpy2.ipython

## **0. Packages Installation**

In [2]:
%%R

# Register the package folder under /content/drive/MyDrive as an R library
# location, then print the resulting library search path to confirm it.
.libPaths(new = "/content/drive/MyDrive/R packages")
print(.libPaths())

[1] "/content/drive/MyDrive/R packages" "/usr/local/lib/R/site-library"    
[3] "/usr/lib/R/site-library"           "/usr/lib/R/library"               


In [3]:
%%R

# One-time installation (left commented out once the packages are installed):
# install.packages("mice")        # multiple imputation (M.I) of missing values
# install.packages("stringr")     # string utilities

library(mice)
library(stringr)

R[write to console]: 
Attaching package: ‘mice’


R[write to console]: The following object is masked from ‘package:stats’:

    filter


R[write to console]: The following objects are masked from ‘package:base’:

    cbind, rbind




## **1. Read and Variables Selection**

### **1) Read data (od)**

In [4]:
%%R

# Load the source data set (the CSV is read as EUC-KR) and preview the first
# rows.
od <- read.csv(
    file = "drive/MyDrive/PLS_양돈 농가/New_data.csv",
    fileEncoding = "euc-kr"
)
print(head(od))

  yy mm dd   ou  nh3   h2s  mm.1    dms  dmds    aca     ppa    iba    bta
1 18  4 24  300 0.56 145.0  0.09  12.80 0.025  30.60   17.57   2.78  10.80
2 18  4 24  300 0.62 310.0 19.40 462.00 0.025 916.52 1055.96 109.07 977.44
3 18  5  8  448 0.97  90.5  0.04  63.90 0.025 663.42  737.23  73.94 837.83
4 18  5  8 2080 1.02 344.0  0.04  13.00 0.630 490.79  570.61  51.09 719.27
5 18  5 15  310 1.18 312.0  0.04   0.04 0.025 404.53  468.03  43.17 573.75
6 18  5 15  669 0.98  61.4  0.04   0.04 0.025 349.63  347.86  36.76 582.25
     iva    vla    ph     pc   id    sk farm
1   3.64   7.72  2.84   0.03 0.20  3.00   YG
2 319.49 619.49 30.51 137.81 3.60 10.40   YG
3 215.64 469.79 14.61  67.52 0.94  3.92   YG
4 168.48 237.64  8.90  60.56 5.55  7.31   YG
5 157.22 205.67  9.21  69.90 0.20  5.43   YG
6 127.25 196.24  7.19  51.03 0.20  4.48   YG


### **2) Variables Selection (odo)**

In [5]:
%%R

# Build the analysis table: drop the yy/mm/dd and farm columns, and also dmds,
# which is excluded because about 90% of its values are missing.
odo <- subset(od, select = -c(yy, mm, dd, farm, dmds))
# print(head(odo))
str(odo)

'data.frame':	57 obs. of  15 variables:
 $ ou  : num  300 300 448 2080 310 669 300 300 2080 669 ...
 $ nh3 : num  0.56 0.62 0.97 1.02 1.18 0.98 0.95 1.5 1.27 1.25 ...
 $ h2s : num  145 310 90.5 344 312 61.4 261 457 61.3 343 ...
 $ mm.1: num  0.09 19.4 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 ...
 $ dms : num  12.8 462 63.9 13 0.04 0.04 30.5 0.04 0.04 0.04 ...
 $ aca : num  30.6 916.5 663.4 490.8 404.5 ...
 $ ppa : num  17.6 1056 737.2 570.6 468 ...
 $ iba : num  2.78 109.07 73.94 51.09 43.17 ...
 $ bta : num  10.8 977.4 837.8 719.3 573.8 ...
 $ iva : num  3.64 319.49 215.64 168.48 157.22 ...
 $ vla : num  7.72 619.49 469.79 237.64 205.67 ...
 $ ph  : num  2.84 30.51 14.61 8.9 9.21 ...
 $ pc  : num  0.03 137.81 67.52 60.56 69.9 ...
 $ id  : num  0.2 3.6 0.94 5.55 0.2 0.2 0.5 5.3 0.49 0.53 ...
 $ sk  : num  3 10.4 3.92 7.31 5.43 4.48 5.17 9.32 8.64 3.37 ...


## **2. 결측치 대체**

- ND　→　MDL/2

- NA　→　M.I(Multiple Imputation) method

### **1) MDL값 불러오기**

- MDL = 물질 검출 한계 = Method Detection Limit　:　ND로 측정된 데이터를 MDL/2값으로 대체

In [6]:
%%R

# Load the method detection limits (MDL) and drop dmds, which is not kept in
# odo either.
MDL <- read.csv("drive/MyDrive/PLS_양돈 농가/MDL.csv", fileEncoding = "euc-kr")
MDL <- subset(MDL, select = -c(dmds))

# The MDL columns are renamed BY POSITION to the names of every odo column
# except the first one (ou). Check that the column counts agree first, so a
# mismatch stops here instead of silently attaching limits to the wrong
# compounds.
# NOTE(review): this still assumes MDL.csv lists the compounds in the same
# order as odo - confirm against the file.
stopifnot(ncol(MDL) == ncol(odo) - 1L)
names(MDL) <- names(odo)[-1]
MDL

   nh3  h2s mm.1  dms  aca  ppa  iba  bta  iva  vla   ph   pc  id   sk
1 0.08 0.06 0.07 0.08 0.07 0.34 0.52 0.93 0.49 0.53 0.09 0.06 0.4 0.38


### **2) 결측치 대체**

In [7]:
%%R

### Treat cells that hold a single space as missing (NA)
odo <- replace(odo, odo == " ", NA)


### Count the missing entries
# Compare cell by cell. A data frame is not a character vector, so calling
# str_count() on it does not reliably count the "ND" cells.
nd <- sum(odo == "ND", na.rm = TRUE)    # non-detects recorded as "ND"
na <- sum(is.na(odo))


### Replace the missing entries
if ((nd + na) == 0) {
    print("결측치 개수 : 0개")
} else {

    # Replace "ND" first, while the affected columns are still character.
    # After the numeric conversion below, "ND" would turn into NA and could
    # no longer be told apart from a blank cell.
    cat("ND 개수 :", nd, "개")

    # Every column except the first (ou) needs a detection limit in MDL.
    measured <- names(odo)[-1]
    stopifnot(all(measured %in% names(MDL)))

    for (col in measured) {
        half_mdl <- as.numeric(MDL[[col]])[1] / 2
        # Fix: substitute MDL/2 for the "ND" cells. The previous code did the
        # reverse (it set values equal to MDL/2 to 0) and never touched "ND".
        is_nd <- !is.na(odo[[col]]) & odo[[col]] == "ND"
        odo[[col]][is_nd] <- half_mdl
    }
    ndr <- sum(odo == "ND", na.rm = TRUE)    # "ND" cells left after replacement
    cat("->", ndr, "개 (MDL/2로 대체)")


    # Convert the character columns to numeric so that any remaining
    # non-numeric text becomes NA and can be imputed.
    indx <- vapply(odo, is.character, logical(1))
    odo[indx] <- lapply(odo[indx], as.numeric)

    cat("\nNA 개수 :", na, "개")
    # NOTE(review): mice() is called without a seed, so the imputed values
    # differ from run to run - consider passing seed = <fixed value>.
    imp <- mice(odo)            # multiple imputation (M.I)
    odo <- complete(imp)
    nar <- sum(is.na(odo))      # NA cells left after imputation
    cat("→", nar, "개 (M.I로 대체)")

}


[1] "결측치 개수 : 0개"


In [8]:
%%R

# Preview the first rows of the table after missing-value handling.
print(head(odo))

    ou  nh3   h2s  mm.1    dms    aca     ppa    iba    bta    iva    vla    ph
1  300 0.56 145.0  0.09  12.80  30.60   17.57   2.78  10.80   3.64   7.72  2.84
2  300 0.62 310.0 19.40 462.00 916.52 1055.96 109.07 977.44 319.49 619.49 30.51
3  448 0.97  90.5  0.04  63.90 663.42  737.23  73.94 837.83 215.64 469.79 14.61
4 2080 1.02 344.0  0.04  13.00 490.79  570.61  51.09 719.27 168.48 237.64  8.90
5  310 1.18 312.0  0.04   0.04 404.53  468.03  43.17 573.75 157.22 205.67  9.21
6  669 0.98  61.4  0.04   0.04 349.63  347.86  36.76 582.25 127.25 196.24  7.19
      pc   id    sk
1   0.03 0.20  3.00
2 137.81 3.60 10.40
3  67.52 0.94  3.92
4  60.56 5.55  7.31
5  69.90 0.20  5.43
6  51.03 0.20  4.48


## **3. CSV파일로 저장 (New_data_preprocessed.csv)**

In [13]:
%%R

# Save the preprocessed table.
# - row.names = FALSE omits the row-name column (FALSE is spelled out because
#   the shorthand F is an ordinary variable that can be reassigned).
# - fileEncoding is set explicitly so the file matches how it is read back
#   (fileEncoding = "euc-kr") instead of depending on the session locale.
write.csv(odo, file = "drive/MyDrive/PLS_양돈 농가/New_data_preprocessed.csv",
          row.names = FALSE, fileEncoding = "euc-kr")

## **4. 저장된 파일 확인**

In [14]:
%%R

# Read the saved file back in to confirm that the export worked.
check_odo <- read.csv(
    file = "drive/MyDrive/PLS_양돈 농가/New_data_preprocessed.csv",
    fileEncoding = "euc-kr"
)

In [15]:
%%R

# Preview the first rows of the re-loaded file.
print(head(check_odo))

    ou  nh3   h2s  mm.1    dms    aca     ppa    iba    bta    iva    vla    ph
1  300 0.56 145.0  0.09  12.80  30.60   17.57   2.78  10.80   3.64   7.72  2.84
2  300 0.62 310.0 19.40 462.00 916.52 1055.96 109.07 977.44 319.49 619.49 30.51
3  448 0.97  90.5  0.04  63.90 663.42  737.23  73.94 837.83 215.64 469.79 14.61
4 2080 1.02 344.0  0.04  13.00 490.79  570.61  51.09 719.27 168.48 237.64  8.90
5  310 1.18 312.0  0.04   0.04 404.53  468.03  43.17 573.75 157.22 205.67  9.21
6  669 0.98  61.4  0.04   0.04 349.63  347.86  36.76 582.25 127.25 196.24  7.19
      pc   id    sk
1   0.03 0.20  3.00
2 137.81 3.60 10.40
3  67.52 0.94  3.92
4  60.56 5.55  7.31
5  69.90 0.20  5.43
6  51.03 0.20  4.48
