spamApplications

파일 경로 설정 라이브러리 설정

# Directories with the training articles (pop / notpop) and the held-out
# articles used for validation (poptest / notpoptest).
pop.path <- file.path("..", "colectedData", "pop")
notpop.path <- file.path("..", "colectedData", "notpop")
poptest.path <- file.path("..", "colectedData", "poptest")
notpoptest.path <- file.path("..", "colectedData", "notpoptest")

library(ggplot2)
library(tm)
library(KoNLP)

# install.packages('KoNLP')  # 설치가 필요한 경우만 실행
# install.packages('tm')

특히 KoNLP설치하는데 문제가 있다면 R을 관리자 권한(sudo)로 실행해서 설치해볼것!!

데이터 학습시키기

# Read one article file and reduce it to the nouns it contains.
#
# Args:
#   path: path of a text file, read as UTF-8.
#
# Returns:
#   A single string holding every extracted noun, separated by "\n".
get.msg <- function(path) {
  con <- file(path, open = "rt", encoding = "UTF-8")
  # Release the connection even when readLines() or extractNoun() fails.
  on.exit(close(con), add = TRUE)
  text <- readLines(con)

  # Literal fragments stripped before noun extraction, applied in this order.
  # NOTE(review): several entries look like mis-decoded Korean text; they are
  # kept byte-for-byte because they are matched against the input as-is.
  noise <- c(
    ",", "“", "”", "‘", "’", "·",
    "ᄮix", "곸쑝濡", "있다”고", "湲곗옄", "寃껋쑝濡", "寃껋씠"
  )
  for (fragment in noise) {
    # fixed = TRUE: every fragment is a plain string, not a regular expression.
    text <- gsub(fragment, "", text, fixed = TRUE)
  }

  # extractNoun() yields one result per input line when given several lines;
  # flatten it so paste() joins the nouns themselves rather than the deparsed
  # form of each per-line vector.
  nouns <- unlist(extractNoun(text), use.names = FALSE)
  paste(nouns, collapse = "\n")
}

# Build a tm term-document matrix from a character vector of documents.
#
# Args:
#   doc.vec: character vector, one element per document.
#
# Returns:
#   The TermDocumentMatrix built from doc.vec with punctuation and numbers
#   removed.
get.tdm <- function(doc.vec) {
  # NOTE(review): confirm that the installed tm version still honours
  # minDocFreq; the option is passed through unchanged.
  tdm.options <- list(
    removePunctuation = TRUE,
    removeNumbers = TRUE,
    minDocFreq = 2
  )
  TermDocumentMatrix(Corpus(VectorSource(doc.vec)), control = tdm.options)
}

get.msg 함수는 문서를 열어서 조사와 따옴표, 쉼표 따위를 모두 제거하고 명사만 추출해 주는 함수입니다. get.tdm 함수는 get.msg 함수가 뽑아온 단어들을 가지고 단어-문서 행렬(term-document matrix)을 만듭니다.

인기있는 뉴스 베이즈 분류 학습시키기

# Train the "popular" side of the classifier: read every article under
# pop.path, build the term-document matrix and derive per-term statistics.
pop.docs <- dir(pop.path)
# vapply() guarantees exactly one string per file, whatever the file count.
all.pop <- vapply(
  pop.docs,
  function(p) get.msg(paste(pop.path, p, sep = "/")),
  character(1)
)
pop.tdm <- get.tdm(all.pop)
pop.matrix <- as.matrix(pop.tdm)
pop.counts <- rowSums(pop.matrix)
# Build the table directly so the counts never pass through a character
# matrix on their way into the data frame.
pop.df <- data.frame(
  term = as.character(names(pop.counts)),
  frequency = as.numeric(pop.counts),
  stringsAsFactors = FALSE
)
# Share of documents that contain each term. The vectorised form also copes
# with a matrix that has no rows, where a 1:nrow() loop would fail.
pop.occurrence <- unname(rowSums(pop.matrix > 0)) / ncol(pop.matrix)
pop.density <- pop.df$frequency / sum(pop.df$frequency)
pop.df <- transform(pop.df, density = pop.density, occurrence = pop.occurrence)
# Keep terms of at least three characters.
pop.df <- subset(pop.df, nchar(term) >= 3)
# Drops the first 50 remaining rows.
# NOTE(review): the reason for skipping these rows is not visible here; the
# step is kept exactly as it was -- confirm it is still wanted.
pop.df <- tail(pop.df, n = nrow(pop.df) - 50)
# Keep only terms seen in fewer than 5% of the documents.
pop.df <- subset(pop.df, pop.df$occurrence < 0.05)
head(pop.df[with(pop.df, order(-occurrence)), ], n = 50)

세 글자 이상 추출

cutpopoccurrence, word>=3

인기 없는 뉴스 베이즈 분류 학습시키기

# Train the "unpopular" side of the classifier: read every article under
# notpop.path, build the term-document matrix and derive per-term statistics.
notpop.docs <- dir(notpop.path)
# vapply() guarantees exactly one string per file, whatever the file count.
all.notpop <- vapply(
  notpop.docs,
  function(p) get.msg(paste(notpop.path, p, sep = "/")),
  character(1)
)
notpop.tdm <- get.tdm(all.notpop)
notpop.matrix <- as.matrix(notpop.tdm)
notpop.counts <- rowSums(notpop.matrix)
# Build the table directly so the counts never pass through a character
# matrix on their way into the data frame.
notpop.df <- data.frame(
  term = as.character(names(notpop.counts)),
  frequency = as.numeric(notpop.counts),
  stringsAsFactors = FALSE
)
# Share of documents that contain each term. The vectorised form also copes
# with a matrix that has no rows, where a 1:nrow() loop would fail.
notpop.occurrence <- unname(rowSums(notpop.matrix > 0)) / ncol(notpop.matrix)
notpop.density <- notpop.df$frequency / sum(notpop.df$frequency)
notpop.df <- transform(
  notpop.df,
  density = notpop.density,
  occurrence = notpop.occurrence
)
# Keep terms of at least three characters.
notpop.df <- subset(notpop.df, nchar(term) >= 3)
# Keep only terms seen in fewer than 5% of the documents.
notpop.df <- subset(notpop.df, notpop.df$occurrence < 0.05)
head(notpop.df[with(notpop.df, order(-occurrence)), ], n = 50)

세 글자 이상 추출

cutnotpopoccurence

검증하기

# Score one document against a trained term table.
#
# Args:
#   path:        file to score; read with get.msg() and tabulated with
#                get.tdm().
#   training.df: data frame with columns `term` and `occurrence`.
#   prior:       prior probability of the class.
#   c:           factor applied once for every term of the document that is
#                absent from training.df.
#
# Returns:
#   prior * product of the matched occurrences * c^(number of unmatched
#   terms); just prior * c^(number of terms) when nothing matches.
classify.email <- function(path, training.df, prior = 0.5, c = 1e-6) {
  term.freq <- rowSums(as.matrix(get.tdm(get.msg(path))))
  known.terms <- intersect(names(term.freq), training.df$term)

  # No overlap with the training vocabulary: every term is penalised by c.
  if (length(known.terms) < 1) {
    return(prior * c^(length(term.freq)))
  }

  known.probs <- training.df$occurrence[match(known.terms, training.df$term)]
  prior * prod(known.probs) * c^(length(term.freq) - length(known.terms))
}

검증용 인기글에 분류기 실행해보기

# Score every held-out popular article against both trained tables; an
# article is labelled TRUE when its "pop" score is the larger of the two.
poptest.docs <- dir(poptest.path)
poptest.poptest <- sapply(
  poptest.docs,
  function(doc) classify.email(file.path(poptest.path, doc), training.df = pop.df)
)
poptest.notpoptest <- sapply(
  poptest.docs,
  function(doc) classify.email(file.path(poptest.path, doc), training.df = notpop.df)
)
pop.res <- poptest.poptest > poptest.notpoptest
summary(pop.res)

pop.resword3

검증용 인기 없는글에 분류기 실행해보기

# Score every held-out unpopular article against both trained tables; TRUE
# marks an article whose "pop" score is the larger of the two.
notpoptest.docs <- dir(notpoptest.path)
notpoptest.poptest <- sapply(
  notpoptest.docs,
  function(doc) classify.email(file.path(notpoptest.path, doc), training.df = pop.df)
)
notpoptest.notpoptest <- sapply(
  notpoptest.docs,
  function(doc) classify.email(file.path(notpoptest.path, doc), training.df = notpop.df)
)
notpop.res <- notpoptest.poptest > notpoptest.notpoptest
summary(notpop.res)

notpopresword3

분류기가 검증데이터의 분류를 제대로 하지 못하고 있습니다. 그 원인을 생각해 보면

분류기에 문제가 있다.
실제 인기있는 뉴스와 인기 없는 뉴스에서 단어의 차이는 없다.
데이터를 모두 인기 있는 뉴스만 모았거나 모두 인기 없는 뉴스만 모았다. (데이터를 잘못 수집했다.)

수식의 오류

상수 c의 문제

prior*c^(length(msg.freq))는 매칭되는 단어가 하나도 없는 경우에 계산되는 베이즈 확률값이 0이 되지 않도록, 상수 c를 msg 테이블의 행의 수(검증 문서에 등장한 단어의 종류 수)만큼 거듭제곱하여 0보다 큰 아주 작은 실수값을 가지도록 합니다. 그러나 실제 R에서는 대략 1e-323보다 작은 수(정확히는 약 4.9e-324 미만)는 부동소수점으로 표현할 수 없어 모두 0으로 만들어 버립니다(언더플로). 예를 들어서, 검증 문서에 등장한 단어의 종류가 모두 432가지일 때, 그 문서에 학습한 단어가 한 번도 출현하지 않았으면 prior*c^(432) = 0이 되고, 학습한 단어가 80종류 정도 출현했다고 해도 prior*prod(match.probs)*c^(432-80) = 0이 되어 버립니다.

floating problem

prod(match.probs)의 문제

prior*prod(match.probs)*c^(length(msg.freq)-length(msg.match))에서 prod(match.probs)는 검증 문서에 출현한 단어의 occurrence들을 곱합니다. 그런데 문제는 occurrence가 0~1 사이 값이라서 곱할수록 결과가 계속 작아진다는 점입니다. 예를 들어 0.5*0.5*0.3의 계산 과정을 보면 0.5*0.5 = 0.25로 값이 줄고, 다시 0.25*0.3 = 0.075로 더 줄어듭니다. 그리고 앞서 상수 c에서 봤듯이 소수점 아래 자릿수가 324자리 정도를 넘어가면 R이 값을 0으로 만들어 버리기 때문에, occurrence를 계속 곱하기만 한다면 확률 계산 결과가 0이 될 가능성이 있습니다.

문제점 해결 방안

# Revised scorer: sums the matched occurrences instead of multiplying them
# and uses a milder penalty constant for unmatched terms.
#
# Args:
#   path:        file to score; read with get.msg() and tabulated with
#                get.tdm().
#   training.df: data frame with columns `term` and `occurrence`.
#   prior:       prior probability of the class.
#   c:           factor applied once for every term of the document that is
#                absent from training.df.
#
# Returns:
#   prior * sum of the matched occurrences * c^(number of unmatched terms);
#   just prior * c^(number of terms) when nothing matches.
classify.email <- function(path, training.df, prior = 0.5, c = 1e-1) {
  doc.tdm <- get.tdm(get.msg(path))
  doc.freq <- rowSums(as.matrix(doc.tdm))
  shared <- intersect(names(doc.freq), training.df$term)
  n.terms <- length(doc.freq)
  n.shared <- length(shared)

  if (n.shared >= 1) {
    shared.probs <- training.df$occurrence[match(shared, training.df$term)]
    score <- prior * sum(shared.probs) * c^(n.terms - n.shared)
  } else {
    score <- prior * c^(n.terms)
  }
  return(score)
}

이와 같이 상수 c = 1e-1로 변경하고 prod(match.probs)는 sum(match.probs)로 변경합니다.

학습한 데이터로 분류기 실행

학습한 인기 있는글

# Sanity check: run the classifier over the popular training articles
# themselves. TRUE means the "pop" score beat the "notpop" score.
pop.docs <- dir(pop.path)
pop.poptest <- sapply(
  pop.docs,
  function(doc) classify.email(file.path(pop.path, doc), training.df = pop.df)
)
pop.notpoptest <- sapply(
  pop.docs,
  function(doc) classify.email(file.path(pop.path, doc), training.df = notpop.df)
)
pop.res <- pop.poptest > pop.notpoptest
summary(pop.res)

selfPop

학습한 인기 없는 글

# Sanity check: run the classifier over the unpopular training articles
# themselves. TRUE means the "pop" score beat the "notpop" score.
notpop.docs <- dir(notpop.path)
notpop.poptest <- sapply(
  notpop.docs,
  function(doc) classify.email(file.path(notpop.path, doc), training.df = pop.df)
)
notpop.notpoptest <- sapply(
  notpop.docs,
  function(doc) classify.email(file.path(notpop.path, doc), training.df = notpop.df)
)
notpop.res <- notpop.poptest > notpop.notpoptest
summary(notpop.res)

selfnotPop

자기가 학습한 데이터는 제대로 분류를 해줍니다.

다시 검증하기

다시 검증용 인기글에 분류기 실행해 보기

# Re-run the held-out popular articles through the revised classifier.
poptest.docs <- dir(poptest.path)
score.poptest <- function(training) {
  sapply(
    poptest.docs,
    function(doc) classify.email(file.path(poptest.path, doc), training.df = training)
  )
}
poptest.poptest <- score.poptest(pop.df)
poptest.notpoptest <- score.poptest(notpop.df)
rm(score.poptest)  # helper is local to this step
# TRUE where the article is judged popular.
pop.res <- poptest.poptest > poptest.notpoptest
summary(pop.res)

retestpop

다시 검증용 인기 없는글에 분류기 실행해보기

# Re-run the held-out unpopular articles through the revised classifier.
notpoptest.docs <- dir(notpoptest.path)
score.notpoptest <- function(training) {
  sapply(
    notpoptest.docs,
    function(doc) classify.email(file.path(notpoptest.path, doc), training.df = training)
  )
}
notpoptest.poptest <- score.notpoptest(pop.df)
notpoptest.notpoptest <- score.notpoptest(notpop.df)
rm(score.notpoptest)  # helper is local to this step
# TRUE where the article is judged popular.
notpop.res <- notpoptest.poptest > notpoptest.notpoptest
summary(notpop.res)

retestnotpop

검증용 인기 없는 뉴스는 분류가 제대로 되지 않았습니다.

데이터 다시 수집

이전에 모았던 인기없는 뉴스보다 더 인기가 없는 뉴스를 수집하여 분류기를 학습시킵니다.

# Retrain the "unpopular" side from the current contents of notpop.path.
# This pass applies no occurrence cut-off to the resulting table.
notpop.docs <- dir(notpop.path)
# vapply() guarantees exactly one string per file, whatever the file count.
all.notpop <- vapply(
  notpop.docs,
  function(p) get.msg(paste(notpop.path, p, sep = "/")),
  character(1)
)
notpop.tdm <- get.tdm(all.notpop)
notpop.matrix <- as.matrix(notpop.tdm)
notpop.counts <- rowSums(notpop.matrix)
# Build the table directly so the counts never pass through a character
# matrix on their way into the data frame.
notpop.df <- data.frame(
  term = as.character(names(notpop.counts)),
  frequency = as.numeric(notpop.counts),
  stringsAsFactors = FALSE
)
# Share of documents that contain each term. The vectorised form also copes
# with a matrix that has no rows, where a 1:nrow() loop would fail.
notpop.occurrence <- unname(rowSums(notpop.matrix > 0)) / ncol(notpop.matrix)
notpop.density <- notpop.df$frequency / sum(notpop.df$frequency)
notpop.df <- transform(
  notpop.df,
  density = notpop.density,
  occurrence = notpop.occurrence
)
# Keep terms of at least three characters.
notpop.df <- subset(notpop.df, nchar(term) >= 3)
head(notpop.df[with(notpop.df, order(-occurrence)), ], n = 50)

새 데이터로 인기 뉴스 다시 검증

# Validate the held-out popular articles against the retrained tables.
poptest.docs <- dir(poptest.path)
poptest.files <- file.path(poptest.path, poptest.docs)
poptest.poptest <- sapply(
  poptest.docs,
  function(doc) classify.email(poptest.files[match(doc, poptest.docs)], training.df = pop.df)
)
poptest.notpoptest <- sapply(
  poptest.docs,
  function(doc) classify.email(poptest.files[match(doc, poptest.docs)], training.df = notpop.df)
)
rm(poptest.files)  # scratch vector, not needed afterwards
# TRUE where the "pop" score is strictly larger.
pop.res <- poptest.poptest > poptest.notpoptest
summary(pop.res)

newDataTestpop

새 데이터로 인기 없는 뉴스 다시 검증

# Validate the held-out unpopular articles against the retrained tables.
notpoptest.docs <- dir(notpoptest.path)
notpoptest.poptest <- sapply(
  notpoptest.docs,
  function(article) {
    classify.email(file.path(notpoptest.path, article), training.df = pop.df)
  }
)
notpoptest.notpoptest <- sapply(
  notpoptest.docs,
  function(article) {
    classify.email(file.path(notpoptest.path, article), training.df = notpop.df)
  }
)
# TRUE where the "pop" score is strictly larger.
notpop.res <- notpoptest.poptest > notpoptest.notpoptest
summary(notpop.res)

newDataTestnotpop

검증용 인기 뉴스, 인기 없는 뉴스에 분류기 검증

# Score one file against both trained tables and record the verdict.
#
# Args:
#   path: file to classify.
#
# Returns:
#   Numeric vector c(notpop score, pop score, flag); the flag is 1 when the
#   pop score is strictly larger than the notpop score and 0 otherwise.
pop.classifier <- function(path) {
  score.pop <- classify.email(path, pop.df)
  score.notpop <- classify.email(path, notpop.df)
  is.pop <- as.numeric(score.pop > score.notpop)
  c(score.notpop, score.pop, is.pop)
}

# Classify both held-out sets and collect the scores in one data frame.
notpoptest.docs <- dir(notpoptest.path)
poptest.docs <- dir(poptest.path)

# One c(notpop score, pop score, flag) vector per file.
notpoptest.class <- suppressWarnings(
  lapply(file.path(notpoptest.path, notpoptest.docs), pop.classifier)
)
poptest.class <- suppressWarnings(
  lapply(file.path(poptest.path, poptest.docs), pop.classifier)
)

# Stack the per-file vectors into rows and tag every row with its true type.
# Appending the label turns each matrix into a character matrix.
notpoptest.matrix <- do.call(rbind, notpoptest.class)
poptest.matrix <- do.call(rbind, poptest.class)
notpoptest.final <- cbind(notpoptest.matrix, "notpop")
poptest.final <- cbind(poptest.matrix, "pop")

class.matrix <- rbind(notpoptest.final, poptest.final)
class.df <- data.frame(class.matrix, stringsAsFactors = FALSE)
names(class.df) <- c("Pr.notpop", "Pr.pop", "Class", "Type")

# Restore proper column types after the trip through the character matrix.
score.cols <- c("Pr.pop", "Pr.notpop")
class.df[score.cols] <- lapply(class.df[score.cols], as.numeric)
rm(score.cols)
class.df$Class <- as.logical(as.numeric(class.df$Class))
class.df$Type <- as.factor(class.df$Type)

뉴스 종류별 긍정오류, 부정오류 계산하기

# Split the results by true type and predicted class, then tabulate counts.
# Class is logical here, so it is tested directly.
notpop.False <- subset(class.df, Type == "notpop" & !Class)
notpop.True <- subset(class.df, Type == "notpop" & Class)
pop.False <- subset(class.df, Type == "pop" & !Class)
pop.True <- subset(class.df, Type == "pop" & Class)

notpop.FalseCount <- nrow(notpop.False)
notpop.TrueCount <- nrow(notpop.True)
pop.FalseCount <- nrow(pop.False)
pop.TrueCount <- nrow(pop.True)

# One row per article type; columns are predicted FALSE / predicted TRUE.
notpop.row <- c(notpop.FalseCount, notpop.TrueCount)
pop.row <- c(pop.FalseCount, pop.TrueCount)
allarticle <- rbind(notpop.row, pop.row)
colnames(allarticle) <- c("False", "True")

allArticlefalsetrue

뉴스종류	False	True
인기없는 뉴스	0.696	0.303
인기있는 뉴스	0.301	0.698

그래프

# Scatter the log scores of every validated article: x is the pop score,
# y is the notpop score, and the y = x line marks the decision boundary.
class.plot <- ggplot(class.df, aes(x = log(Pr.pop), y = log(Pr.notpop))) +
  geom_point(aes(shape = Type, alpha = 0.5)) +
  geom_abline(intercept = 0, slope = 1) +
  scale_shape_manual(
    name = "news Type",
    values = c("pop" = 2, "notpop" = 3)
  ) +
  scale_alpha(guide = "none") +
  labs(x = "log[Pr(pop)]", y = "log[Pr(notpop)]") +
  theme_bw() +
  theme(axis.text.x = element_blank(), axis.text.y = element_blank())

# Write the figure next to the script as a 10 x 10 PDF.
ggsave(
  filename = file.path("./", "newPopGraph.pdf"),
  plot = class.plot,
  width = 10,
  height = 10
)
# Share of FALSE and TRUE entries in a vector of classification outcomes.
#
# Args:
#   bool.vector: vector of TRUE/FALSE outcomes.
#
# Returns:
#   Numeric vector c(share FALSE, share TRUE). Both shares are taken over the
#   full length of bool.vector, so NA entries count toward neither share.
get.results <- function(bool.vector) {
  total <- length(bool.vector)
  share.false <- sum(bool.vector == FALSE, na.rm = TRUE) / total
  share.true <- sum(bool.vector == TRUE, na.rm = TRUE) / total
  c(share.false, share.true)
}

graph

Provide feedback

Saved searches

Use saved searches to filter your results more quickly