#12287.dyn_ha: extract multiLX
#20220715(18.43)
#attempt to extract multilingual and neologistic expressions from German texts
################
library(readtext)
library(purrr)
library(httr)
library(jsonlite)
library(clipr)
library(stringi)
library(stringr)
library(udpipe)
setwd("~/boxHKW/21S/DH/gith/DH_essais/sections/DYN")
getwd()
githbase<-"https://github.com/esteeschwarz/"
githtreemain<-"tree/main/"
githtreeraw<-"raw/main/"
#sections/DYN/data/dyn_ha_wolf_out01/wordlist.csv
# derive the raw GitHub URL for this section from the local working directory,
# e.g. ".../gith/DH_essais/sections/DYN" -> "https://github.com/esteeschwarz/DH_essais/raw/main/sections/DYN"
githcombine <- paste0(githbase, "\\2", githtreeraw, "\\3")
wdlocal <- gsub("(.+gith/)(.+?/)(.+?)", replacement = githcombine, getwd(), perl = TRUE)
githurl <- "https://github.com/esteeschwarz/DH_essais/raw/main/sections/DYN"
githurl <- wdlocal # use the URL derived from the working directory instead of the hard-coded one
dudenurl<- 'https://api.duden.de/v1/spellcheck'
#githurl<-paste0(githbase,"DH_essais",githtree,)
wolfsrc<-paste0(githurl,"/data/dyn_ha_wolf_out01/wordlist.csv")
wolfcsv<-read.csv2(wolfsrc,sep = ",")
datadir<-paste0(getwd(),"/data/")
wolfouttxt<-paste0(datadir,"wolfout.txt")
wolftypes <- wolfcsv[, 1] # first column holds the word types
write_clip(wolfcsv[, 1])
wolfbody <- read_clip() # round trip through the clipboard to get a plain character vector
# flatten chunks of the type list into single strings for the spellcheck request
wolfbodyflat01 <- stri_flatten(wolfbody[300:500], collapse = ", ")
wolfbodyflat02 <- stri_flatten(wolfbody[501:1000], collapse = " ")
wolfbodyflat03 <- stri_flatten(wolfbody[1001:length(wolfbody)], collapse = " ")
#wolfbodyflat04 <- stri_flatten(wolfbody[1501:2000], collapse = " ")
wolfbodyflat01
#dudenapi
dudenhead <- read_json("~/boxHKW/UNI/21S/DH/local/DYN/dudenapi_head.json")
dudenhead$`x-api-key`
req_body <- paste0('{"text":"', wolfbodyflat01, '","dictionary":["Wäl"]}')
cat(req_body)
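# hedged alternative (a sketch, not part of the original run): build the same request
# body with jsonlite so quotes and umlauts inside the text get escaped correctly;
# the field names mirror the hand-built JSON above.
req_body_json <- jsonlite::toJSON(list(text = wolfbodyflat01, dictionary = list("Wäl")),
                                  auto_unbox = TRUE)
# req_body <- req_body_json # could replace the paste0() version above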
req <- httr::POST(dudenurl,
                  httr::add_headers(
                    "x-api-key" = dudenhead$`x-api-key`,
                    "Content-Type" = "application/json"
                  ),
                  body = req_body)
dudenhead$`x-api-key`
#check the response status and parse the JSON body
httr::stop_for_status(req)
json<-httr::content(req, as="parsed")
#out<-fromJSON(json)
#out
#match(json$data$spellAdvices[[3]],"2")
lj<-length(json$data$spellAdvices)
for (k in 1:lj){
  ec <- json$data$spellAdvices[[k]]$errorCode
  # errorCode 2 = noun written without a capital; report everything else
  if (ec != 2) {
    em <- c(k, ec, json$data$spellAdvices[[k]]$originalError)
  } else {
    em <- FALSE
  }
  print(em)
}
print(json$data$spellAdvices[[107]]$originalError)
#### the Duden dictionary is not sufficient; it produces false positives/negatives
#### e.g. when excluding all hits with errorCode 2 (= noun written without a capital):
#### Duden does not flag "gotenhoden", which only gets errorCode 2 and is therefore not
#### recognised as unknown orthography
####
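# a minimal sketch (not part of the original run): collect all spell advices whose
# errorCode is not 2 into one data frame, so candidate neologisms can be reviewed
# in a single table; it only assumes the response fields already used above
# (errorCode, originalError).
advices <- json$data$spellAdvices
candidates <- do.call(rbind, lapply(seq_along(advices), function(k) {
  a <- advices[[k]]
  if (is.null(a$errorCode) || a$errorCode == 2) return(NULL)
  data.frame(index = k, errorCode = a$errorCode,
             original = ifelse(is.null(a$originalError), NA, a$originalError),
             stringsAsFactors = FALSE)
}))
head(candidates)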
ud_model <- udpipe_download_model(language = "german")
# leftover from the udpipe package example, not used here:
#data(brussels_reviews)
#comments <- subset(brussels_reviews, language %in% "es")
ud_model <- udpipe_load_model(ud_model$file_model)
x <- udpipe_annotate(ud_model, x = wolftypes)
x <- as.data.frame(x)
### also not sufficient: in general, many words get wrong lemmas/tags
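# a minimal sketch (assumption, not part of the original run): use the udpipe output
# to flag candidate neologisms, e.g. types that get no lemma or are tagged "X" (other);
# token/lemma/upos are standard columns of the annotation data frame, but whether
# unknown words end up as NA lemmas or "X" tags depends on the model.
candidates_ud <- subset(x, is.na(lemma) | upos == "X")
unique(candidates_ud$token)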
#####
#try ("https://www.semanticsoftware.info/durm-german-lemmatizer")
src <- "https://www.semanticsoftware.info/system/files/delexicon.txt"
delemma <- read.delim(src)
# which word types occur in the lexicon (first column delemma$X.)?
m <- match(wolftypes, delemma$X.)
#m <- match(delemma$X., wolftypes) # reversed direction, not used
mnot <- is.na(m)
wolftypes[mnot == FALSE] # types found in the lexicon
wolftypes[mnot == TRUE]  # types not found: candidate neologisms / foreign words
wolftypes[4]
# the same matching done explicitly in a loop
m <- matrix(NA_integer_, nrow = length(wolftypes), ncol = 1)
for (k in 1:length(wolftypes)) {
  m[k, 1] <- match(wolftypes[k], delemma$X.)
}
mpos <- m[!is.na(m), 1] # lexicon positions of the types that matched
match("siehst", delemma$X.)
delemma$X.[83658]
#### not to be used: the lexicon may contain enough entries, but only the lemmas
#### next: "https://weblicht.sfs.uni-tuebingen.de/weblicht/"
getwd()
library(xml2)
data <- read_xml("data/chainer_result.xml")
xml_name(data)
data %>% xml_ns()
#data %>% xml_ns_strip() # stripping namespaces would break the tc: prefixed XPath queries below
data %>%
  xml_find_all('//*') %>%
  xml_path()
?xml_path
# show all xpath paths to a head element (example copied from a TEI tutorial; pos_data is not defined here)
#xml_find_all(pos_data, '/TEI/text/body/div[1]/head[1]/s/w[@lemma = "erst"]')
lemmas <- xml_find_all(data, '/*/*[2]/tc:lemmas')
xml_text(head(xml_children(lemmas))) == "auf"
m <- grepl("unknown", xml_text(head(xml_children(lemmas))))
str_detect(xml_text(head(xml_children(lemmas))), ".*")
xml_find_all(lemmas, "tc:lemma[2]") # second lemma child
all_heads <- data %>%
  xml_find_all('/*/*[2]/tc:lemmas') %>%
  xml_path()
all_heads
all_heads[1:10]
lemmas
all_chapters <- data %>%
  xml_find_all('//tc:lemmas') %>%
  xml_text()
all_chapters
xml_find_all(data, '//tc:lemmas/tc:lemma[text()="<unknown>"]') # lemmas the lemmatiser could not resolve
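# a minimal sketch (assumption about the TCF layout of chainer_result.xml): lemma
# elements usually carry a tokenIDs attribute pointing at token elements, so unknown
# lemmas can be mapped back to their surface forms. element and attribute names
# (tc:lemma, tc:token, tokenIDs, ID) follow the WebLicht/TCF convention and may need
# adjusting to the actual file.
unknown_lemmas <- xml_find_all(data, '//tc:lemmas/tc:lemma[text()="<unknown>"]')
unknown_ids <- xml_attr(unknown_lemmas, "tokenIDs")
tokens <- xml_find_all(data, '//tc:tokens/tc:token')
token_ids <- xml_attr(tokens, "ID")
unknown_tokens <- xml_text(tokens[token_ids %in% unknown_ids])
unknown_tokens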
dwdsapi<-"https://www.dwds.de/api/wb/snippet/?q="
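# a minimal sketch (not part of the original run): query the DWDS snippet API for a
# single candidate type and treat an empty result as "not in the dictionary"; the
# response is assumed to be a JSON array of matching entries, no field names are
# relied on.
check_dwds <- function(word) {
  res <- httr::GET(paste0(dwdsapi, utils::URLencode(word)))
  httr::stop_for_status(res)
  length(httr::content(res, as = "parsed")) > 0
}
# check_dwds("gotenhoden") # expected FALSE for a neologism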
####
#pdf: extract text for types/tokens
#install.packages("pdftools") # run once if needed
library(pdftools)
# candidate sources; the last assignment wins
src <- "~/boxHKW/21S/AVL/AVLgk_wolf_kochanie.pdf"
src <- "~/boxHKW/21S/AVL/AVLgk_wolf_lengevitch.pdf"
src <- "data/samplepdf.pdf"
src <- "data/wolfout_mod.txt"
####
# the two branches below presumably come from a shiny context; `extracted` and
# `file$datapath` are not defined in this script
if (extracted == 1) {textpdf <- pdf_text(file$datapath)}
if (extracted == 2) {texttxt <- readtext(file$datapath)$text}
d1 <- textpdf
d2 <- stri_extract_all_words(d1)
d3 <- unique(unlist(d2)) # types over all pages, not only the first element
e1 <- texttxt
e2 <- stri_extract_all_words(e1)
e3 <- unique(unlist(e2))
####
#d <- pdf_text(src) # only for PDF sources; src points at a .txt here
d <- readtext(src)
d1 <- stri_extract_all_words(d$text)
d2 <- unique(d1[[1]]) # types
d2
length(d1[[1]]) # tokens
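# a minimal sketch (not in the original run): type/token counts over the whole
# extracted text, as suggested by the "types/tokens" note above.
tokens_all <- unlist(d1)
types_all <- unique(tokens_all)
length(types_all) / length(tokens_all) # type-token ratio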
src
# OCR attempt; tesseract is required and src must point at a PDF
#install.packages("tesseract") # run once if needed
library(tesseract)
e <- pdf_ocr_text(src)
e
# not satisfactory.