/
CrawlingConfig.java
149 lines (120 loc) · 5.45 KB
/
CrawlingConfig.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
/*
* Copyright 2012-2024 CodeLibs Project and the Others.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/
package org.codelibs.fess.es.config.exentity;
import java.util.Map;
import java.util.function.Supplier;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.codelibs.core.lang.StringUtil;
import org.codelibs.fess.Constants;
import org.codelibs.fess.crawler.client.CrawlerClientFactory;
import org.codelibs.fess.crawler.client.ftp.FtpClient;
import org.codelibs.fess.crawler.client.http.HcHttpClient;
import org.codelibs.fess.crawler.client.smb.SmbClient;
import org.codelibs.fess.mylasta.direction.FessConfig;
import org.codelibs.fess.util.ComponentUtil;
/**
 * Contract shared by crawling configurations.
 *
 * <p>Besides plain accessors, the interface supplies two default helpers:
 * {@link #initializeDefaultHttpProxy(Map)} copies the proxy settings found in
 * {@link FessConfig} into a client parameter map, and {@link #getScriptType()}
 * resolves the script type from the {@link ConfigName#CONFIG} parameters.
 * The nested {@link Param} classes collect the parameter key names.</p>
 */
public interface CrawlingConfig {

    /**
     * @return the ID of this configuration
     */
    String getId();

    /**
     * @return the name of this configuration
     */
    String getName();

    /**
     * @return the permissions associated with this configuration
     */
    String[] getPermissions();

    /**
     * @return the virtual hosts associated with this configuration
     */
    String[] getVirtualHosts();

    /**
     * @return the document boost value, as a string
     */
    String getDocumentBoost();

    /**
     * Resolves the indexing target for the given input.
     *
     * <p>NOTE(review): the meaning of {@code input} and of the returned value is
     * defined by the implementations, which are not visible here.</p>
     *
     * @param input the value to evaluate
     * @return the indexing target for {@code input}
     */
    String getIndexingTarget(String input);

    /**
     * @return the config ID (presumably the type-prefixed form built by
     *         {@link ConfigType#getConfigId(String)} - confirm against implementations)
     */
    String getConfigId();

    /**
     * @return the time-to-live value (unit is not visible from this interface)
     */
    Integer getTimeToLive();

    /**
     * Initializes a crawler client factory for this configuration.
     *
     * @param creator supplier of a {@link CrawlerClientFactory}; presumably used to
     *        create the instance that gets initialized - confirm against implementations
     * @return the client factory
     */
    CrawlerClientFactory initializeClientFactory(Supplier<CrawlerClientFactory> creator);

    /**
     * Returns the parameters registered under the given group.
     *
     * @param name the parameter group
     * @return the key/value parameters of that group
     */
    Map<String, String> getConfigParameterMap(ConfigName name);

    /**
     * Copies the default HTTP proxy settings from {@link FessConfig} into the given map.
     *
     * <p>Nothing is written unless both the proxy host and the proxy port are non-blank.
     * When they are, host and port are stored under {@link Param.Client#PROXY_HOST} and
     * {@link Param.Client#PROXY_PORT}; if a username and a password are both non-null,
     * a {@link UsernamePasswordCredentials} is additionally stored under
     * {@link HcHttpClient#PROXY_CREDENTIALS_PROPERTY}.</p>
     *
     * @param paramMap the client parameter map to populate; modified in place
     */
    default void initializeDefaultHttpProxy(final Map<String, Object> paramMap) {
        final FessConfig fessConfig = ComponentUtil.getFessConfig();
        final String proxyHost = fessConfig.getHttpProxyHost();
        final String proxyPort = fessConfig.getHttpProxyPort();
        if (StringUtil.isNotBlank(proxyHost) && StringUtil.isNotBlank(proxyPort)) {
            paramMap.put(Param.Client.PROXY_HOST, proxyHost);
            paramMap.put(Param.Client.PROXY_PORT, proxyPort);
            final String proxyUsername = fessConfig.getHttpProxyUsername();
            final String proxyPassword = fessConfig.getHttpProxyPassword();
            // NOTE(review): only null is checked here, so an empty username/password
            // still produces credentials - confirm this is intended.
            if (proxyUsername != null && proxyPassword != null) {
                paramMap.put(HcHttpClient.PROXY_CREDENTIALS_PROPERTY,
                        new UsernamePasswordCredentials(proxyUsername, proxyPassword));
            }
        }
    }

    /**
     * Resolves the script type of this configuration.
     *
     * <p>The value stored under {@link Param.Config#SCRIPT_TYPE} in the
     * {@link ConfigName#CONFIG} parameters is returned when it is not blank;
     * otherwise (including when no parameter map is available)
     * {@link Constants#DEFAULT_SCRIPT} is returned.</p>
     *
     * @return the configured script type, or the default one
     */
    default String getScriptType() {
        final Map<String, String> configParams = getConfigParameterMap(ConfigName.CONFIG);
        // Guard against implementations that return null instead of an empty map.
        if (configParams != null) {
            final String scriptType = configParams.get(Param.Config.SCRIPT_TYPE);
            if (StringUtil.isNotBlank(scriptType)) {
                return scriptType;
            }
        }
        return Constants.DEFAULT_SCRIPT;
    }

    /**
     * Kinds of crawling configuration, each carrying a one-letter prefix
     * used to build config IDs.
     */
    public enum ConfigType {
        WEB("W"), FILE("F"), DATA("D");

        /** Prefix prepended to an ID by {@link #getConfigId(String)}. */
        private final String typePrefix;

        ConfigType(final String typePrefix) {
            this.typePrefix = typePrefix;
        }

        /**
         * @return the prefix of this type
         */
        public String getTypePrefix() {
            return typePrefix;
        }

        /**
         * Builds a config ID by prepending this type's prefix to the given ID.
         *
         * @param id the raw ID, may be {@code null}
         * @return the prefixed ID, or {@code null} when {@code id} is {@code null}
         */
        String getConfigId(final String id) {
            if (id == null) {
                return null;
            }
            return typePrefix + id;
        }
    }

    /**
     * Parameter groups accepted by {@link CrawlingConfig#getConfigParameterMap(ConfigName)}.
     */
    public enum ConfigName {
        CLIENT, XPATH, META, VALUE, SCRIPT, FIELD, CONFIG;
    }

    /**
     * Holder for parameter key names, grouped by parameter prefix.
     */
    public static class Param {

        // client.*
        /** Keys of the {@code client.*} group. */
        public static class Client {
            public static final String SMB_AUTHENTICATIONS = SmbClient.SMB_AUTHENTICATIONS_PROPERTY;
            public static final String SMB1_AUTHENTICATIONS =
                    org.codelibs.fess.crawler.client.smb1.SmbClient.SMB_AUTHENTICATIONS_PROPERTY;
            public static final String FTP_AUTHENTICATIONS = FtpClient.FTP_AUTHENTICATIONS_PROPERTY;
            public static final String ROBOTS_TXT_ENABLED = HcHttpClient.ROBOTS_TXT_ENABLED_PROPERTY;
            public static final String PROXY_PASSWORD = "proxyPassword";
            public static final String PROXY_USERNAME = "proxyUsername";
            public static final String PROXY_PORT = HcHttpClient.PROXY_PORT_PROPERTY;
            public static final String PROXY_HOST = HcHttpClient.PROXY_HOST_PROPERTY;
            public static final String USER_AGENT = HcHttpClient.USER_AGENT_PROPERTY;
        }

        // xpath.*
        /** Keys of the {@code xpath.*} group. */
        public static class XPath {
            public static final String DEFAULT_LANG = "default.lang";
            public static final String DEFAULT_CONTENT = "default.content";
            public static final String DEFAULT_DIGEST = "default.digest";
            // xpath.<field>=<value>
        }

        // config.*
        /** Keys of the {@code config.*} group. */
        public static class Config {
            public static final String KEEP_ORIGINAL_BODY = "keep.original.body";
            public static final String CLEANUP_ALL = "cleanup.all";
            public static final String CLEANUP_URL_FILTERS = "cleanup.urlFilters";
            public static final String JCIFS_PREFIX = "jcifs.";
            public static final String HTML_CANONICAL_XPATH = "html.canonical.xpath";
            public static final String HTML_PRUNED_TAGS = "html.pruned.tags";
            public static final String PIPELINE = "pipeline";
            public static final String IGNORE_ROBOTS_TAGS = "ignore.robots.tags";
            public static final String SCRIPT_TYPE = "script.type";
        }

        // meta.*
        // meta.<field>=<value>

        // value.*
        // value.<field>=<value>

        // script.*
        // script.<field>=<value>

        // field.*
        // field.<field>=<value>
    }
}