다시교재로

fixformular 문서에서 직접 작성한 분류기를 가지고 교재의 스팸 햄문서를 학습시키고 검증한 결과 분류가 안됐다...

그래서 다시 교재의 분류기에 충실하기로 하고 최대한 베이즈 확률 결과가 좋게 나오도록 해야한다.

대부분의 내용은 spamApplications 문서와 동일하다. 교재의 분류기를 최대한 그대로 사용하면서, 좋은 결과를 내도록 살짝씩 수정한 부분만 여기에 작성했다.

두 글자 이상 추출

가장 먼저, 세 글자 이상인 단어만 학습하던 것을 두 글자 이상으로 바꿔서 학습한 단어가 검증 데이터에 더 많이 출현하도록 했다. 그리고 occurrence가 0.05 이상인 단어를 배제하던 것을 0.18 이상인 단어를 배제하도록 바꿨다.

인기있는 뉴스 베이즈 분류 학습시키기

# Train on the popular-news corpus: build a term table with per-term
# frequency, density (share of all term counts) and occurrence (share of
# documents that contain the term).
pop.docs <- dir(pop.path)
# vapply keeps the result a named character vector even for an empty directory.
all.pop <- vapply(
  pop.docs,
  function(p) get.msg(file.path(pop.path, p)),
  character(1)
)
pop.tdm <- get.tdm(all.pop)
pop.matrix <- as.matrix(pop.tdm)
pop.counts <- rowSums(pop.matrix)
pop.df <- data.frame(
  term = as.character(names(pop.counts)),
  frequency = as.numeric(pop.counts),
  stringsAsFactors = FALSE
)
# Vectorized document share per term; unlike 1:nrow() indexing this also
# works when the matrix has no rows.
pop.occurrence <- unname(rowSums(pop.matrix > 0, na.rm = TRUE) / ncol(pop.matrix))
pop.density <- pop.df$frequency / sum(pop.df$frequency)
pop.df <- transform(pop.df, density = pop.density, occurrence = pop.occurrence)
# Keep only terms of two or more characters.
pop.df <- subset(pop.df, nchar(term) >= 2)
# Drops the first 50 remaining rows.
# NOTE(review): presumably these are junk tokens at the head of the term
# list; the not-popular table has no equivalent step -- confirm intent.
pop.df <- tail(pop.df, n = nrow(pop.df) - 50)
# Exclude terms that appear in 18% or more of the documents.
pop.df <- subset(pop.df, occurrence < 0.18)
head(pop.df[order(-pop.df$occurrence), ], n = 50)

상위 occurrence 거르기전

notcutword2pop

두 글자 이상 추출

018word2pop

인기 없는 뉴스 베이즈 분류 학습시키기

# Train on the not-popular-news corpus: build a term table with per-term
# frequency, density (share of all term counts) and occurrence (share of
# documents that contain the term).
notpop.docs <- dir(notpop.path)
# vapply keeps the result a named character vector even for an empty directory.
all.notpop <- vapply(
  notpop.docs,
  function(p) get.msg(file.path(notpop.path, p)),
  character(1)
)
notpop.tdm <- get.tdm(all.notpop)
notpop.matrix <- as.matrix(notpop.tdm)
notpop.counts <- rowSums(notpop.matrix)
notpop.df <- data.frame(
  term = as.character(names(notpop.counts)),
  frequency = as.numeric(notpop.counts),
  stringsAsFactors = FALSE
)
# Vectorized document share per term; unlike 1:nrow() indexing this also
# works when the matrix has no rows.
notpop.occurrence <- unname(
  rowSums(notpop.matrix > 0, na.rm = TRUE) / ncol(notpop.matrix)
)
notpop.density <- notpop.df$frequency / sum(notpop.df$frequency)
notpop.df <- transform(
  notpop.df,
  density = notpop.density,
  occurrence = notpop.occurrence
)
# Keep only terms of two or more characters.
notpop.df <- subset(notpop.df, nchar(term) >= 2)
# Exclude terms that appear in 18% or more of the documents.
notpop.df <- subset(notpop.df, occurrence < 0.18)
head(notpop.df[order(-notpop.df$occurrence), ], n = 50)

상위 occurrence 거르기전

notcutword2notpop

두 글자 이상 추출

018word2notpop

분류기의 상수 c

# Score one document against a trained term table (naive-Bayes-style product).
#
# Args:
#   path:        file to classify; read with get.msg() and tokenized with
#                get.tdm().
#   training.df: data frame with `term` and `occurrence` columns.
#   prior:       prior probability of the class.
#   c:           probability used for every document term that is missing
#                from training.df.
#   log.p:       if TRUE, return the natural log of the score, computed as a
#                sum of logs so that long documents do not underflow to 0.
#                The default (FALSE) returns the plain product as before.
#
# Returns:
#   A single number: prior * prod(matched occurrences) * c^(unmatched terms),
#   or its logarithm when log.p is TRUE.
classify.email <- function(path, training.df, prior = 0.5, c = 1e-5,
                           log.p = FALSE) {
  msg <- get.msg(path)
  msg.tdm <- get.tdm(msg)
  msg.freq <- rowSums(as.matrix(msg.tdm))
  msg.match <- intersect(names(msg.freq), training.df$term)
  n.unmatched <- length(msg.freq) - length(msg.match)
  # With no matched terms this is empty, and prod() of an empty vector is 1,
  # so the "no match" case needs no separate branch.
  match.probs <- training.df$occurrence[match(msg.match, training.df$term)]
  if (log.p) {
    return(log(prior) + sum(log(match.probs)) + n.unmatched * log(c))
  }
  prior * prod(match.probs) * c^n.unmatched
}

상수 c는 검증 데이터의 단어가 학습 데이터에 없을 때 곱해지는 값이라 의미가 크지만, 계산 결과를 모두 0으로 만들어 버리는 주범이기도 하다. 그래서 1e-6에서 살짝만 값을 키워 1e-5로 조정했다.

오류율

finalerror

오류율만 본다면 분류기가 제대로 동작하지 않은 것 같다.

그래프

finalgraph

그래프를 보면 비인기 뉴스는 비인기라고 제대로 분류를 하지만 인기 뉴스는 대부분 y축에 붙어있는 것으로 보아 제대로 분류가 안됐음을 알 수 있다. 이는 인기 뉴스의 학습 데이터가 적기 때문인데, 비인기 뉴스는 비교적 적은 학습 데이터로도 좋은 성능의 분류기를 만들 수 있음을 알 수 있다.

prior 변화

인기뉴스 비인기 뉴스 사전확률을 0.2, 0.8로 설정한 경우

prior28

인기 뉴스 비인기 뉴스 사전확률을 0.05, 0.95로 설정한 경우

prior0595

prior 변경에 따라 변화가 있지만 검증데이터의 크기가 작아서 변화가 미세하다.

학습데이터로 분류기 실행해 보기

자기가 학습한 데이터는 제대로 분류를 해야한다. 그러나 인기 뉴스는 제대로 분류를 못하고 있다.

# Sanity check: run the classifier on the documents it was trained on.
notpoptest.docs <- dir(notpop.path)
poptest.docs <- dir(pop.path)

# Each element is the vector returned by pop.classifier() for one file.
notpoptest.class <- suppressWarnings(
  lapply(
    notpoptest.docs,
    function(doc) pop.classifier(file.path(notpop.path, doc))
  )
)

poptest.class <- suppressWarnings(
  lapply(
    poptest.docs,
    function(doc) pop.classifier(file.path(pop.path, doc))
  )
)

selfexecute

prior변화

# Classify one file as popular (1) or not popular (0), using priors of 0.2
# for the popular class and 0.8 for the not-popular class.
# Returns c(not-popular score, popular score, class flag).
pop.classifier <- function(path) {
  score.pop <- classify.email(path, pop.df, prior = 0.2)
  score.notpop <- classify.email(path, notpop.df, prior = 0.8)
  # 1 when the popular score is strictly larger, otherwise 0.
  is.pop <- as.numeric(score.pop > score.notpop)
  c(score.notpop, score.pop, is.pop)
}

bigpopprior0208

bigpoppriorgraph

학습, 검증데이터가 적은 전체 R 코드


# File paths: training and held-out directories for each class.

pop.path        <- "../colectedData/pop"
notpop.path     <- "../colectedData/notpop"
poptest.path    <- "../colectedData/poptest"
notpoptest.path <- "../colectedData/notpoptest"

#라이브러리 설정

library(ggplot2)
library(tm)
library(KoNLP)

#함수들

# Read one UTF-8 text file, strip a fixed list of characters/fragments,
# extract nouns with KoNLP::extractNoun() and return them as a single
# newline-separated string.
#
# Args:
#   path: path of the file to read.
#
# Returns:
#   A length-one character vector of nouns joined by "\n".
get.msg <- function(path) {
  con <- file(path, open = "rt", encoding = "UTF-8")
  # Close the connection even if reading or noun extraction fails.
  on.exit(close(con), add = TRUE)
  text <- readLines(con)
  text <- noquote(text)
  # Patterns are removed in this order.
  # NOTE(review): "있다”고" can never match because every ” has already been
  # removed by an earlier pattern. Several other patterns look like
  # mis-decoded Korean text -- confirm they match the real input.
  strip.patterns <- c(
    ",", "“", "”", "‘", "’", "·",
    "ᄮix", "곸쑝濡", "있다”고", "湲곗옄", "寃껋쑝濡", "寃껋씠"
  )
  for (pattern in strip.patterns) {
    text <- gsub(pattern, "", text)
  }
  # Flatten before pasting: paste() on a list deparses each element into
  # text like c("a", "b") instead of joining the words. unlist() leaves a
  # plain character vector unchanged.
  nouns <- unlist(extractNoun(text), use.names = FALSE)
  paste(nouns, collapse = "\n")
}

# Build a tm term-document matrix from a character vector of documents,
# with punctuation and numbers removed.
# NOTE(review): whether `minDocFreq` is honoured depends on the installed
# tm version -- confirm.
get.tdm <- function(doc.vec) {
  tdm.options <- list(
    removePunctuation = TRUE,
    removeNumbers = TRUE,
    minDocFreq = 2
  )
  TermDocumentMatrix(Corpus(VectorSource(doc.vec)), tdm.options)
}
# Score one document against a trained term table (naive-Bayes-style product).
#
# Args:
#   path:        file to classify; read with get.msg() and tokenized with
#                get.tdm().
#   training.df: data frame with `term` and `occurrence` columns.
#   prior:       prior probability of the class.
#   c:           probability used for every document term that is missing
#                from training.df.
#   log.p:       if TRUE, return the natural log of the score, computed as a
#                sum of logs so that long documents do not underflow to 0.
#                The default (FALSE) returns the plain product as before.
#
# Returns:
#   A single number: prior * prod(matched occurrences) * c^(unmatched terms),
#   or its logarithm when log.p is TRUE.
classify.email <- function(path, training.df, prior = 0.5, c = 1e-5,
                           log.p = FALSE) {
  msg <- get.msg(path)
  msg.tdm <- get.tdm(msg)
  msg.freq <- rowSums(as.matrix(msg.tdm))
  msg.match <- intersect(names(msg.freq), training.df$term)
  n.unmatched <- length(msg.freq) - length(msg.match)
  # With no matched terms this is empty, and prod() of an empty vector is 1,
  # so the "no match" case needs no separate branch.
  match.probs <- training.df$occurrence[match(msg.match, training.df$term)]
  if (log.p) {
    return(log(prior) + sum(log(match.probs)) + n.unmatched * log(c))
  }
  prior * prod(match.probs) * c^n.unmatched
}
# Classify one file as popular (1) or not popular (0), using the default
# prior of classify.email() for both classes.
# Returns c(not-popular score, popular score, class flag).
pop.classifier <- function(path) {
  score.pop <- classify.email(path, pop.df)
  score.notpop <- classify.email(path, notpop.df)
  # 1 when the popular score is strictly larger, otherwise 0.
  is.pop <- as.numeric(score.pop > score.notpop)
  c(score.notpop, score.pop, is.pop)
}
# Train on the popular-news corpus: build a term table with per-term
# frequency, density (share of all term counts) and occurrence (share of
# documents that contain the term).

pop.docs <- dir(pop.path)
# vapply keeps the result a named character vector even for an empty directory.
all.pop <- vapply(
  pop.docs,
  function(p) get.msg(file.path(pop.path, p)),
  character(1)
)
pop.tdm <- get.tdm(all.pop)
pop.matrix <- as.matrix(pop.tdm)
pop.counts <- rowSums(pop.matrix)
pop.df <- data.frame(
  term = as.character(names(pop.counts)),
  frequency = as.numeric(pop.counts),
  stringsAsFactors = FALSE
)
# Vectorized document share per term; unlike 1:nrow() indexing this also
# works when the matrix has no rows.
pop.occurrence <- unname(rowSums(pop.matrix > 0, na.rm = TRUE) / ncol(pop.matrix))
pop.density <- pop.df$frequency / sum(pop.df$frequency)
pop.df <- transform(pop.df, density = pop.density, occurrence = pop.occurrence)
# Keep only terms of two or more characters.
pop.df <- subset(pop.df, nchar(term) >= 2)
# Drops the first 50 remaining rows.
# NOTE(review): presumably these are junk tokens at the head of the term
# list; the not-popular table has no equivalent step -- confirm intent.
pop.df <- tail(pop.df, n = nrow(pop.df) - 50)
# Exclude terms that appear in 18% or more of the documents.
pop.df <- subset(pop.df, occurrence < 0.18)
head(pop.df[order(-pop.df$occurrence), ], n = 50)

# Train on the not-popular-news corpus: build a term table with per-term
# frequency, density (share of all term counts) and occurrence (share of
# documents that contain the term).

notpop.docs <- dir(notpop.path)
# vapply keeps the result a named character vector even for an empty directory.
all.notpop <- vapply(
  notpop.docs,
  function(p) get.msg(file.path(notpop.path, p)),
  character(1)
)
notpop.tdm <- get.tdm(all.notpop)
notpop.matrix <- as.matrix(notpop.tdm)
notpop.counts <- rowSums(notpop.matrix)
notpop.df <- data.frame(
  term = as.character(names(notpop.counts)),
  frequency = as.numeric(notpop.counts),
  stringsAsFactors = FALSE
)
# Vectorized document share per term; unlike 1:nrow() indexing this also
# works when the matrix has no rows.
notpop.occurrence <- unname(
  rowSums(notpop.matrix > 0, na.rm = TRUE) / ncol(notpop.matrix)
)
notpop.density <- notpop.df$frequency / sum(notpop.df$frequency)
notpop.df <- transform(
  notpop.df,
  density = notpop.density,
  occurrence = notpop.occurrence
)
# Keep only terms of two or more characters.
notpop.df <- subset(notpop.df, nchar(term) >= 2)
# Exclude terms that appear in 18% or more of the documents.
notpop.df <- subset(notpop.df, occurrence < 0.18)
head(notpop.df[order(-notpop.df$occurrence), ], n = 50)

# Validate the classifier on the held-out documents of both classes and
# collect the results in class.df.

notpoptest.docs <- dir(notpoptest.path)
poptest.docs <- dir(poptest.path)

# Each element is the vector returned by pop.classifier() for one file.
notpoptest.class <- suppressWarnings(
  lapply(
    notpoptest.docs,
    function(doc) pop.classifier(file.path(notpoptest.path, doc))
  )
)
poptest.class <- suppressWarnings(
  lapply(
    poptest.docs,
    function(doc) pop.classifier(file.path(poptest.path, doc))
  )
)

# One row per document; appending the label turns the matrix into character.
notpoptest.matrix <- do.call(rbind, notpoptest.class)
notpoptest.final <- cbind(notpoptest.matrix, "notpop")

poptest.matrix <- do.call(rbind, poptest.class)
poptest.final <- cbind(poptest.matrix, "pop")

class.matrix <- rbind(notpoptest.final, poptest.final)
class.df <- data.frame(class.matrix, stringsAsFactors = FALSE)
names(class.df) <- c("Pr.notpop", "Pr.pop", "Class", "Type")

# Convert the character columns back to their working types.
score.cols <- c("Pr.notpop", "Pr.pop")
class.df[score.cols] <- lapply(class.df[score.cols], as.numeric)
class.df$Class <- as.logical(as.numeric(class.df$Class))
class.df$Type <- factor(class.df$Type)

# Inspect the errors: split class.df by true type and predicted class, then
# tabulate the counts (rows = true type, columns = predicted Class).

notpop.False <- subset(class.df, Type == "notpop" & !Class)
notpop.True <- subset(class.df, Type == "notpop" & Class)
pop.False <- subset(class.df, Type == "pop" & !Class)
pop.True <- subset(class.df, Type == "pop" & Class)

notpop.FalseCount <- nrow(notpop.False)
notpop.TrueCount <- nrow(notpop.True)
pop.FalseCount <- nrow(pop.False)
pop.TrueCount <- nrow(pop.True)

notpop.row <- c(notpop.FalseCount, notpop.TrueCount)
pop.row <- c(pop.FalseCount, pop.TrueCount)

allarticle <- rbind(notpop.row, pop.row)
colnames(allarticle) <- c("False", "True")

# Draw the plot: log popular score (x) against log not-popular score (y),
# one point per document, with the y = x line for reference, then save it
# as a PDF.

class.plot <- ggplot(class.df, aes(x = log(Pr.pop), y = log(Pr.notpop))) +
  # alpha is mapped inside aes(), so scale_alpha(guide = "none") below is
  # what hides its legend.
  geom_point(aes(shape = Type, alpha = 0.5)) +
  geom_abline(intercept = 0, slope = 1) +
  scale_shape_manual(
    name = "news Type",
    values = c("pop" = 2, "notpop" = 3)
  ) +
  scale_alpha(guide = "none") +
  labs(x = "log[Pr(pop)]", y = "log[Pr(notpop)]") +
  theme_bw() +
  theme(axis.text.x = element_blank(), axis.text.y = element_blank())

ggsave(
  filename = file.path("./", "newPopGraph.pdf"),
  plot = class.plot,
  width = 10,
  height = 10
)
# Return c(share equal to FALSE, share equal to TRUE) of bool.vector.
# NA entries count toward the denominator but toward neither share.
get.results <- function(bool.vector) {
  total <- length(bool.vector)
  n.false <- sum(bool.vector == FALSE, na.rm = TRUE)
  n.true <- sum(bool.vector == TRUE, na.rm = TRUE)
  c(n.false / total, n.true / total)
}

학습, 검증데이터가 많은 전체 R 코드


# File paths: training and held-out directories for each class.

pop.path        <- "../colectedData/bigpop"
notpop.path     <- "../colectedData/notpop"
poptest.path    <- "../colectedData/bigpoptest"
notpoptest.path <- "../colectedData/notpoptest"

#라이브러리 설정

library(ggplot2)
library(tm)
library(KoNLP)

#함수들

# Read one UTF-8 text file, strip a fixed list of characters/fragments,
# extract nouns with KoNLP::extractNoun() and return them as a single
# newline-separated string.
#
# Args:
#   path: path of the file to read.
#
# Returns:
#   A length-one character vector of nouns joined by "\n".
get.msg <- function(path) {
  con <- file(path, open = "rt", encoding = "UTF-8")
  # Close the connection even if reading or noun extraction fails.
  on.exit(close(con), add = TRUE)
  text <- readLines(con)
  text <- noquote(text)
  # Patterns are removed in this order.
  # NOTE(review): "있다”고" can never match because every ” has already been
  # removed by an earlier pattern. Several other patterns look like
  # mis-decoded Korean text -- confirm they match the real input.
  strip.patterns <- c(
    ",", "“", "”", "‘", "’", "·",
    "ᄮix", "곸쑝濡", "있다”고", "湲곗옄", "寃껋쑝濡", "寃껋씠"
  )
  for (pattern in strip.patterns) {
    text <- gsub(pattern, "", text)
  }
  # Flatten before pasting: paste() on a list deparses each element into
  # text like c("a", "b") instead of joining the words. unlist() leaves a
  # plain character vector unchanged.
  nouns <- unlist(extractNoun(text), use.names = FALSE)
  paste(nouns, collapse = "\n")
}

# Build a tm term-document matrix from a character vector of documents,
# with punctuation and numbers removed.
# NOTE(review): whether `minDocFreq` is honoured depends on the installed
# tm version -- confirm.
get.tdm <- function(doc.vec) {
  tdm.options <- list(
    removePunctuation = TRUE,
    removeNumbers = TRUE,
    minDocFreq = 2
  )
  TermDocumentMatrix(Corpus(VectorSource(doc.vec)), tdm.options)
}
# Score one document against a trained term table (naive-Bayes-style product).
#
# Args:
#   path:        file to classify; read with get.msg() and tokenized with
#                get.tdm().
#   training.df: data frame with `term` and `occurrence` columns.
#   prior:       prior probability of the class.
#   c:           probability used for every document term that is missing
#                from training.df.
#   log.p:       if TRUE, return the natural log of the score, computed as a
#                sum of logs so that long documents do not underflow to 0.
#                The default (FALSE) returns the plain product as before.
#
# Returns:
#   A single number: prior * prod(matched occurrences) * c^(unmatched terms),
#   or its logarithm when log.p is TRUE.
classify.email <- function(path, training.df, prior = 0.5, c = 1e-5,
                           log.p = FALSE) {
  msg <- get.msg(path)
  msg.tdm <- get.tdm(msg)
  msg.freq <- rowSums(as.matrix(msg.tdm))
  msg.match <- intersect(names(msg.freq), training.df$term)
  n.unmatched <- length(msg.freq) - length(msg.match)
  # With no matched terms this is empty, and prod() of an empty vector is 1,
  # so the "no match" case needs no separate branch.
  match.probs <- training.df$occurrence[match(msg.match, training.df$term)]
  if (log.p) {
    return(log(prior) + sum(log(match.probs)) + n.unmatched * log(c))
  }
  prior * prod(match.probs) * c^n.unmatched
}
# Classify one file as popular (1) or not popular (0), using the default
# prior of classify.email() for both classes.
# Returns c(not-popular score, popular score, class flag).
pop.classifier <- function(path) {
  score.pop <- classify.email(path, pop.df)
  score.notpop <- classify.email(path, notpop.df)
  # 1 when the popular score is strictly larger, otherwise 0.
  is.pop <- as.numeric(score.pop > score.notpop)
  c(score.notpop, score.pop, is.pop)
}
# Train on the popular-news corpus: build a term table with per-term
# frequency, density (share of all term counts) and occurrence (share of
# documents that contain the term).

pop.docs <- dir(pop.path)
# vapply keeps the result a named character vector even for an empty directory.
all.pop <- vapply(
  pop.docs,
  function(p) get.msg(file.path(pop.path, p)),
  character(1)
)
pop.tdm <- get.tdm(all.pop)
pop.matrix <- as.matrix(pop.tdm)
pop.counts <- rowSums(pop.matrix)
pop.df <- data.frame(
  term = as.character(names(pop.counts)),
  frequency = as.numeric(pop.counts),
  stringsAsFactors = FALSE
)
# Vectorized document share per term; unlike 1:nrow() indexing this also
# works when the matrix has no rows.
pop.occurrence <- unname(rowSums(pop.matrix > 0, na.rm = TRUE) / ncol(pop.matrix))
pop.density <- pop.df$frequency / sum(pop.df$frequency)
pop.df <- transform(pop.df, density = pop.density, occurrence = pop.occurrence)
# Keep only terms of two or more characters.
pop.df <- subset(pop.df, nchar(term) >= 2)
# Drops the first 50 remaining rows.
# NOTE(review): presumably these are junk tokens at the head of the term
# list; the not-popular table has no equivalent step -- confirm intent.
pop.df <- tail(pop.df, n = nrow(pop.df) - 50)
# Exclude terms that appear in 18% or more of the documents.
pop.df <- subset(pop.df, occurrence < 0.18)
head(pop.df[order(-pop.df$occurrence), ], n = 50)

# Train on the not-popular-news corpus: build a term table with per-term
# frequency, density (share of all term counts) and occurrence (share of
# documents that contain the term).

notpop.docs <- dir(notpop.path)
# vapply keeps the result a named character vector even for an empty directory.
all.notpop <- vapply(
  notpop.docs,
  function(p) get.msg(file.path(notpop.path, p)),
  character(1)
)
notpop.tdm <- get.tdm(all.notpop)
notpop.matrix <- as.matrix(notpop.tdm)
notpop.counts <- rowSums(notpop.matrix)
notpop.df <- data.frame(
  term = as.character(names(notpop.counts)),
  frequency = as.numeric(notpop.counts),
  stringsAsFactors = FALSE
)
# Vectorized document share per term; unlike 1:nrow() indexing this also
# works when the matrix has no rows.
notpop.occurrence <- unname(
  rowSums(notpop.matrix > 0, na.rm = TRUE) / ncol(notpop.matrix)
)
notpop.density <- notpop.df$frequency / sum(notpop.df$frequency)
notpop.df <- transform(
  notpop.df,
  density = notpop.density,
  occurrence = notpop.occurrence
)
# Keep only terms of two or more characters.
notpop.df <- subset(notpop.df, nchar(term) >= 2)
# Exclude terms that appear in 18% or more of the documents.
notpop.df <- subset(notpop.df, occurrence < 0.18)
head(notpop.df[order(-notpop.df$occurrence), ], n = 50)

# Validate the classifier on the held-out documents of both classes and
# collect the results in class.df.

notpoptest.docs <- dir(notpoptest.path)
poptest.docs <- dir(poptest.path)

# Each element is the vector returned by pop.classifier() for one file.
notpoptest.class <- suppressWarnings(
  lapply(
    notpoptest.docs,
    function(doc) pop.classifier(file.path(notpoptest.path, doc))
  )
)
poptest.class <- suppressWarnings(
  lapply(
    poptest.docs,
    function(doc) pop.classifier(file.path(poptest.path, doc))
  )
)

# One row per document; appending the label turns the matrix into character.
notpoptest.matrix <- do.call(rbind, notpoptest.class)
notpoptest.final <- cbind(notpoptest.matrix, "notpop")

poptest.matrix <- do.call(rbind, poptest.class)
poptest.final <- cbind(poptest.matrix, "pop")

class.matrix <- rbind(notpoptest.final, poptest.final)
class.df <- data.frame(class.matrix, stringsAsFactors = FALSE)
names(class.df) <- c("Pr.notpop", "Pr.pop", "Class", "Type")

# Convert the character columns back to their working types.
score.cols <- c("Pr.notpop", "Pr.pop")
class.df[score.cols] <- lapply(class.df[score.cols], as.numeric)
class.df$Class <- as.logical(as.numeric(class.df$Class))
class.df$Type <- factor(class.df$Type)

# Inspect the errors: split class.df by true type and predicted class, then
# tabulate the counts (rows = true type, columns = predicted Class).

notpop.False <- subset(class.df, Type == "notpop" & !Class)
notpop.True <- subset(class.df, Type == "notpop" & Class)
pop.False <- subset(class.df, Type == "pop" & !Class)
pop.True <- subset(class.df, Type == "pop" & Class)

notpop.FalseCount <- nrow(notpop.False)
notpop.TrueCount <- nrow(notpop.True)
pop.FalseCount <- nrow(pop.False)
pop.TrueCount <- nrow(pop.True)

notpop.row <- c(notpop.FalseCount, notpop.TrueCount)
pop.row <- c(pop.FalseCount, pop.TrueCount)

allarticle <- rbind(notpop.row, pop.row)
colnames(allarticle) <- c("False", "True")

# Draw the plot: log popular score (x) against log not-popular score (y),
# one point per document, with the y = x line for reference, then save it
# as a PDF.

class.plot <- ggplot(class.df, aes(x = log(Pr.pop), y = log(Pr.notpop))) +
  # alpha is mapped inside aes(), so scale_alpha(guide = "none") below is
  # what hides its legend.
  geom_point(aes(shape = Type, alpha = 0.5)) +
  geom_abline(intercept = 0, slope = 1) +
  scale_shape_manual(
    name = "news Type",
    values = c("pop" = 2, "notpop" = 3)
  ) +
  scale_alpha(guide = "none") +
  labs(x = "log[Pr(pop)]", y = "log[Pr(notpop)]") +
  theme_bw() +
  theme(axis.text.x = element_blank(), axis.text.y = element_blank())

ggsave(
  filename = file.path("./", "newPopGraph.pdf"),
  plot = class.plot,
  width = 10,
  height = 10
)
# Return c(share equal to FALSE, share equal to TRUE) of bool.vector.
# NA entries count toward the denominator but toward neither share.
get.results <- function(bool.vector) {
  total <- length(bool.vector)
  n.false <- sum(bool.vector == FALSE, na.rm = TRUE)
  n.true <- sum(bool.vector == TRUE, na.rm = TRUE)
  c(n.false / total, n.true / total)
}

Provide feedback

Saved searches

Use saved searches to filter your results more quickly