Skip to content

TDM 만들기 테스트 코드

seokwon edited this page Nov 9, 2018 · 9 revisions

초기 설정

testPop.path<-"../poptest"
library('tm')
library('KoNLP')

함수

# Read a UTF-8 text file and reduce it to the nouns it contains.
#
# path: path of the text file to read.
#
# Returns a single character string: every noun reported by extractNoun()
# for the file, joined with "\n".
get.msg <- function(path) {
  con <- file(path, open = "rt", encoding = "UTF-8")
  # Close the connection even if reading or noun extraction fails.
  on.exit(close(con), add = TRUE)
  print("--------------------------------------------------------------------------------Start Con------------------------------------------------------------------------------")
  text <- readLines(con)
  # Strip commas, curly quotes and the middle dot before noun extraction.
  for (ch in c(",", "“", "”", "‘", "’", "·")) {
    text <- gsub(ch, "", text, fixed = TRUE)
  }
  # extractNoun() keeps nouns only (particles are dropped). For multi-line
  # input it may hand back a list with one element per line, so flatten the
  # result; pasting a list directly would emit deparsed `c("..")` text.
  nouns <- unlist(extractNoun(text), use.names = FALSE)
  paste(nouns, collapse = "\n")
}
# List the files under testPop.path and run each one through get.msg();
# all.test ends up with one entry per file.
test.docs <- dir(testPop.path)
all.test <- sapply(
  test.docs,
  function(doc.name) get.msg(file.path(testPop.path, doc.name))
)

 # Build a tm term-document matrix from a vector of documents.
 #
 # doc.vec: vector with one document per element.
 #
 # Returns whatever TermDocumentMatrix() produces for the corpus, requesting
 # punctuation and number removal.
 # NOTE(review): whether `minDocFreq` is honoured depends on the installed tm
 # version - confirm against the tm documentation.
 get.tdm <- function(doc.vec) {
   tdm.options <- list(
     removePunctuation = TRUE,
     removeNumbers = TRUE,
     minDocFreq = 2
   )
   TermDocumentMatrix(Corpus(VectorSource(doc.vec)), tdm.options)
 }

# Build the term-document matrix for the test documents, convert it to a
# plain matrix and print it for inspection.
test.tdm <- get.tdm(all.test)
test.matrix <- as.matrix(test.tdm)
print(test.matrix)

# Score every document under pop.path and notpop.path against both training
# tables and flag the documents whose "pop" score is the larger one.
# NOTE(review): classify.email(), pop.path and notpop.path are not defined in
# the visible part of this file, and pop.df / notpop.df are only built further
# down - all of them must exist before this section is run.

pop.docs <- dir(pop.path)
pop.poptest <- sapply(pop.docs, function(p) {
  classify.email(file.path(pop.path, p), training.df = pop.df)
})
pop.notpoptest <- sapply(pop.docs, function(p) {
  classify.email(file.path(pop.path, p), training.df = notpop.df)
})
pop.res <- ifelse(pop.poptest > pop.notpoptest, TRUE, FALSE)
summary(pop.res)

notpop.docs <- dir(notpop.path)
notpop.poptest <- sapply(notpop.docs, function(p) {
  classify.email(file.path(notpop.path, p), training.df = pop.df)
})
notpop.notpoptest <- sapply(notpop.docs, function(p) {
  classify.email(file.path(notpop.path, p), training.df = notpop.df)
})
notpop.res <- ifelse(notpop.poptest > notpop.notpoptest, TRUE, FALSE)
summary(notpop.res)

# Debug leftover: msg.freq is only created inside the classifier functions,
# so guard the call to keep the script from aborting when it is sourced.
if (exists("msg.freq")) {
  print(head(names(msg.freq)))
}

# Shared worker behind classifypop.email() and classifynotpop.email().
#
# path:        file to classify (read through get.msg() / get.tdm()).
# trpop.df:    training table with `term` and `occurrence` columns ("pop").
# trnotpop.df: training table with `term` and `occurrence` columns ("notpop").
# priorpop, priornotpop: prior weights of the two classes.
# c:           per-term fallback probability used when either table shares
#              no term with the message.
# target:      "pop" or "notpop" - the class whose score is returned.
#
# Returns target.score / (pop.score + notpop.score), or prior * c^n (n being
# the number of terms in the message) when either table has no matching term.
.pop.posterior <- function(path, trpop.df, trnotpop.df, priorpop, priornotpop,
                           c, target) {
  msg <- get.msg(path)
  msg.tdm <- get.tdm(msg)
  msg.freq <- rowSums(as.matrix(msg.tdm))

  msg.popmatch <- intersect(names(msg.freq), trpop.df$term)
  msg.notpopmatch <- intersect(names(msg.freq), trnotpop.df$term)
  print(length(msg.freq))

  for.pop <- identical(target, "pop")
  if (length(msg.popmatch) < 1 || length(msg.notpopmatch) < 1) {
    print(msg.freq)
    target.prior <- if (for.pop) priorpop else priornotpop
    return(target.prior * c^(length(msg.freq)))
  }

  match.popprobs <- trpop.df$occurrence[match(msg.popmatch, trpop.df$term)]
  match.notpopprobs <-
    trnotpop.df$occurrence[match(msg.notpopmatch, trnotpop.df$term)]
  pop.score <- priorpop * prod(match.popprobs)
  notpop.score <- priornotpop * prod(match.notpopprobs)
  molecule <- if (for.pop) pop.score else notpop.score
  denominator <- pop.score + notpop.score
  print(paste(target, "prior"))
  print(molecule / denominator)
  molecule / denominator
}

# Normalised "pop" score of the document at `path`.
# See .pop.posterior() for the meaning of the arguments and the return value.
classifypop.email <- function(path, trpop.df, trnotpop.df, priorpop = 0.5,
                              priornotpop = 0.5, c = 1e-2) {
  .pop.posterior(path, trpop.df, trnotpop.df, priorpop, priornotpop, c,
                 target = "pop")
}

# Normalised "notpop" score of the document at `path`.
# See .pop.posterior() for the meaning of the arguments and the return value.
classifynotpop.email <- function(path, trpop.df, trnotpop.df, priorpop = 0.5,
                                 priornotpop = 0.5, c = 1e-2) {
  .pop.posterior(path, trpop.df, trnotpop.df, priorpop, priornotpop, c,
                 target = "notpop")
}

# Evaluate the two posterior functions on the held-out "pop" and "notpop"
# documents; a document counts as "pop" when its pop score is the larger one.

poptest.docs <- dir(poptest.path)

poptest.poptest <- sapply(poptest.docs, function(p) {
  classifypop.email(file.path(poptest.path, p),
                    trpop.df = pop.df, trnotpop.df = notpop.df)
})
poptest.notpoptest <- sapply(poptest.docs, function(p) {
  classifynotpop.email(file.path(poptest.path, p),
                       trpop.df = pop.df, trnotpop.df = notpop.df)
})
pop.res <- ifelse(poptest.poptest > poptest.notpoptest, TRUE, FALSE)
summary(pop.res)

notpoptest.docs <- dir(notpoptest.path)
notpoptest.poptest <- sapply(notpoptest.docs, function(p) {
  classifypop.email(file.path(notpoptest.path, p),
                    trpop.df = pop.df, trnotpop.df = notpop.df)
})
notpoptest.notpoptest <- sapply(notpoptest.docs, function(p) {
  classifynotpop.email(file.path(notpoptest.path, p),
                       trpop.df = pop.df, trnotpop.df = notpop.df)
})
notpop.res <- ifelse(notpoptest.poptest > notpoptest.notpoptest, TRUE, FALSE)
summary(notpop.res)

# Classify one document with both posterior functions, using equal priors.
#
# path: file to classify.
#
# Returns c(pr.pop, pr.notpop, flag), where flag is 1 when pr.pop is greater
# than pr.notpop and 0 otherwise. Reads the global tables pop.df / notpop.df.
pop.classifier <- function(path) {
  pr.pop <- classifypop.email(path, pop.df, notpop.df,
                              priorpop = 0.5, priornotpop = 0.5)
  pr.notpop <- classifynotpop.email(path, pop.df, notpop.df,
                                    priorpop = 0.5, priornotpop = 0.5)
  c(pr.pop, pr.notpop, ifelse(pr.pop > pr.notpop, 1, 0))
}

# Run pop.classifier() over both held-out sets and collect the results in
# class.df, one row per document.

notpoptest.docs <- dir(notpoptest.path)
poptest.docs <- dir(poptest.path)

notpoptest.class <- suppressWarnings(lapply(notpoptest.docs, function(p) {
  pop.classifier(file.path(notpoptest.path, p))
}))
poptest.class <- suppressWarnings(lapply(poptest.docs, function(p) {
  pop.classifier(file.path(poptest.path, p))
}))

# Append the true label of each set as a fourth column.
notpoptest.matrix <- do.call(rbind, notpoptest.class)
notpoptest.final <- cbind(notpoptest.matrix, "notpop")

poptest.matrix <- do.call(rbind, poptest.class)
poptest.final <- cbind(poptest.matrix, "pop")

class.matrix <- rbind(notpoptest.final, poptest.final)
class.df <- data.frame(class.matrix, stringsAsFactors = FALSE)
# The pop.classifier() in effect at this point returns
# c(pr.pop, pr.notpop, flag), so the pop column has to be labelled first.
names(class.df) <- c("Pr.pop", "Pr.notpop", "Class", "Type")
# cbind() with a string turned every column into text; restore the types.
class.df$Pr.pop <- as.numeric(class.df$Pr.pop)
class.df$Pr.notpop <- as.numeric(class.df$Pr.notpop)
class.df$Class <- as.logical(as.numeric(class.df$Class))
class.df$Type <- as.factor(class.df$Type)

# Re-run the evaluation on the held-out "pop" documents: a document counts as
# "pop" when its pop score is the larger one.
poptest.docs <- dir(poptest.path)
poptest.poptest <- sapply(poptest.docs, function(p) {
  classifypop.email(file.path(poptest.path, p),
                    trpop.df = pop.df, trnotpop.df = notpop.df)
})
poptest.notpoptest <- sapply(poptest.docs, function(p) {
  classifynotpop.email(file.path(poptest.path, p),
                       trpop.df = pop.df, trnotpop.df = notpop.df)
})
pop.res <- ifelse(poptest.poptest > poptest.notpoptest, TRUE, FALSE)
summary(pop.res)

# Build the "pop" training table: one row per term with its total frequency,
# its share of all term counts (density) and the fraction of documents that
# contain it (occurrence).
pop.docs <- dir(pop.path)
pop.docs <- pop.docs[which(pop.docs != "cmds")]
all.pop <- sapply(pop.docs, function(p) get.msg(file.path(pop.path, p)))
pop.tdm <- get.tdm(all.pop)
# This conversion was missing here although pop.matrix is used right below.
pop.matrix <- as.matrix(pop.tdm)
pop.counts <- rowSums(pop.matrix)
pop.df <- data.frame(cbind(names(pop.counts), as.numeric(pop.counts)),
                     stringsAsFactors = FALSE)
names(pop.df) <- c("term", "frequency")
pop.df$frequency <- as.numeric(pop.df$frequency)
# Fraction of documents (columns) in which each term (row) appears; the
# vectorised form also copes with a matrix that has no rows.
pop.occurrence <- unname(rowSums(pop.matrix > 0)) / ncol(pop.matrix)
pop.density <- pop.df$frequency / sum(pop.df$frequency)
pop.df <- transform(pop.df, density = pop.density, occurrence = pop.occurrence)
# Keep terms of at least two characters, drop the first 50 rows, then keep
# only terms that occur in fewer than 5% of the documents.
pop.df <- subset(pop.df, nchar(term) >= 2)
pop.df <- tail(pop.df, n = nrow(pop.df) - 50)
pop.df <- subset(pop.df, pop.df$occurrence < 0.05)
head(pop.df[with(pop.df, order(-occurrence)), ], n = 50)

# Build the "notpop" training table: one row per term with its total
# frequency, its share of all term counts (density) and the fraction of
# documents that contain it (occurrence).
notpop.docs <- dir(notpop.path)
notpop.docs <- notpop.docs[which(notpop.docs != "cmds")]
all.notpop <- sapply(notpop.docs, function(p) {
  get.msg(file.path(notpop.path, p))
})
notpop.tdm <- get.tdm(all.notpop)
notpop.matrix <- as.matrix(notpop.tdm)
notpop.counts <- rowSums(notpop.matrix)
notpop.df <- data.frame(cbind(names(notpop.counts), as.numeric(notpop.counts)),
                        stringsAsFactors = FALSE)
names(notpop.df) <- c("term", "frequency")
notpop.df$frequency <- as.numeric(notpop.df$frequency)
# Fraction of documents (columns) in which each term (row) appears; the
# vectorised form also copes with a matrix that has no rows.
notpop.occurrence <- unname(rowSums(notpop.matrix > 0)) / ncol(notpop.matrix)
notpop.density <- notpop.df$frequency / sum(notpop.df$frequency)
notpop.df <- transform(notpop.df, density = notpop.density,
                       occurrence = notpop.occurrence)
# Keep terms of at least three characters that occur in fewer than 5% of the
# documents.
notpop.df <- subset(notpop.df, nchar(term) >= 3)
notpop.df <- subset(notpop.df, notpop.df$occurrence < 0.05)
head(notpop.df[with(notpop.df, order(-occurrence)), ], n = 50)

# Evaluate on the held-out sets, skipping any entry named "cmds".
# NOTE(review): the notpop half calls classify.email(), which is not defined
# in the visible part of this file.

poptest.docs <- dir(poptest.path)
poptest.docs <- poptest.docs[which(poptest.docs != "cmds")]
poptest.poptest <- sapply(poptest.docs, function(p) {
  classifypop.email(file.path(poptest.path, p),
                    trpop.df = pop.df, trnotpop.df = notpop.df)
})
poptest.notpoptest <- sapply(poptest.docs, function(p) {
  classifynotpop.email(file.path(poptest.path, p),
                       trpop.df = pop.df, trnotpop.df = notpop.df)
})
pop.res <- ifelse(poptest.poptest > poptest.notpoptest, TRUE, FALSE)
summary(pop.res)

notpoptest.docs <- dir(notpoptest.path)
notpoptest.docs <- notpoptest.docs[which(notpoptest.docs != "cmds")]
notpoptest.poptest <- sapply(notpoptest.docs, function(p) {
  classify.email(file.path(notpoptest.path, p), training.df = pop.df)
})
notpoptest.notpoptest <- sapply(notpoptest.docs, function(p) {
  classify.email(file.path(notpoptest.path, p), training.df = notpop.df)
})
notpop.res <- ifelse(notpoptest.poptest > notpoptest.notpoptest, TRUE, FALSE)
summary(notpop.res)

# Classify one document with classify.email() against both training tables,
# using a prior of 0.2 for "pop" and 0.8 for "notpop".
#
# path: file to classify.
#
# Returns c(pr.notpop, pr.pop, flag), where flag is 1 when pr.pop is greater
# than pr.notpop and 0 otherwise. Reads the global tables pop.df / notpop.df.
pop.classifier <- function(path) {
  pr.pop <- classify.email(path, pop.df, prior = 0.2)
  pr.notpop <- classify.email(path, notpop.df, prior = 0.8)
  c(pr.notpop, pr.pop, ifelse(pr.pop > pr.notpop, 1, 0))
}

# Run pop.classifier() over both held-out sets and collect the results in
# class.df, one row per document.

notpoptest.docs <- dir(notpoptest.path)
poptest.docs <- dir(poptest.path)

notpoptest.class <- suppressWarnings(lapply(notpoptest.docs, function(p) {
  pop.classifier(file.path(notpoptest.path, p))
}))
poptest.class <- suppressWarnings(lapply(poptest.docs, function(p) {
  pop.classifier(file.path(poptest.path, p))
}))

# Append the true label of each set as a fourth column.
notpoptest.matrix <- do.call(rbind, notpoptest.class)
notpoptest.final <- cbind(notpoptest.matrix, "notpop")

poptest.matrix <- do.call(rbind, poptest.class)
poptest.final <- cbind(poptest.matrix, "pop")

class.matrix <- rbind(notpoptest.final, poptest.final)
class.df <- data.frame(class.matrix, stringsAsFactors = FALSE)
names(class.df) <- c("Pr.notpop", "Pr.pop", "Class", "Type")
# cbind() with a string turned every column into text; restore the types.
class.df$Pr.pop <- as.numeric(class.df$Pr.pop)
class.df$Pr.notpop <- as.numeric(class.df$Pr.notpop)
class.df$Class <- as.logical(as.numeric(class.df$Class))
class.df$Type <- as.factor(class.df$Type)

# Tabulate class.df into a 2 x 2 count matrix: rows are the true type
# (notpop, pop), columns the predicted Class (False, True).

notpop.False <- subset(class.df, Type == "notpop" & !Class)
notpop.FalseCount <- nrow(notpop.False)

notpop.True <- subset(class.df, Type == "notpop" & Class)
notpop.TrueCount <- nrow(notpop.True)

pop.False <- subset(class.df, Type == "pop" & !Class)
pop.FalseCount <- nrow(pop.False)

pop.True <- subset(class.df, Type == "pop" & Class)
pop.TrueCount <- nrow(pop.True)

notpop.row <- c(notpop.FalseCount, notpop.TrueCount)
pop.row <- c(pop.FalseCount, pop.TrueCount)

allarticle <- rbind(notpop.row, pop.row)
colnames(allarticle) <- c("False", "True")

# Build the unfiltered "pop" training table: one row per term with its total
# frequency, its share of all term counts (density) and the fraction of
# documents that contain it (occurrence).
pop.docs <- dir(pop.path)
all.pop <- sapply(pop.docs, function(p) get.msg(file.path(pop.path, p)))
pop.tdm <- get.tdm(all.pop)
pop.matrix <- as.matrix(pop.tdm)
pop.counts <- rowSums(pop.matrix)
pop.df <- data.frame(cbind(names(pop.counts), as.numeric(pop.counts)),
                     stringsAsFactors = FALSE)
names(pop.df) <- c("term", "frequency")
pop.df$frequency <- as.numeric(pop.df$frequency)
# Fraction of documents (columns) in which each term (row) appears; the
# vectorised form also copes with a matrix that has no rows.
pop.occurrence <- unname(rowSums(pop.matrix > 0)) / ncol(pop.matrix)
pop.density <- pop.df$frequency / sum(pop.df$frequency)
pop.df <- transform(pop.df, density = pop.density, occurrence = pop.occurrence)
head(pop.df[with(pop.df, order(-occurrence)), ], n = 50)