forked from bdionne/indexer
/
indexer_misc.erl
138 lines (115 loc) · 3.52 KB
/
indexer_misc.erl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
%% ---
%% Excerpted from "Programming Erlang",
%% published by The Pragmatic Bookshelf.
%% Copyrights apply to this code. It may not be used to create training material,
%% courses, books, articles, and the like. Contact us if you are in doubt.
%% We make no guarantees that this code is fit for any purpose.
%% Visit http://www.pragmaticprogrammer.com/titles/jaerlang for more book information.
%%
%% Original copyright: "(c) 2007 armstrongonsoftware"
%%---
-module(indexer_misc).
-author('Joe Armstrong').
%%
%% minor tweaks to integrate with couchdb
-author('Bob Dionne').
-export([foreach_word_in_string/3,
mapreduce/4, search/4]).
-import(lists, [filter/2, foreach/2, map/2, reverse/1]).
-include("indexer.hrl").
%% foreach_word_in_string(Str, F, Acc) -> Acc1
%%   Folds F over the words of Str, left to right.
%%   Each word found by get_word/1 is passed to F together with the
%%   current accumulator as F(Word, Acc); the value F returns becomes
%%   the accumulator for the next word. When get_word/1 reports that
%%   no words remain, the accumulator is returned.
foreach_word_in_string(Str, F, Acc) ->
    fold_next_word(get_word(Str), F, Acc).

%% Dispatches on the result of get_word/1: stop on 'no', otherwise
%% apply F to the word and continue with the rest of the string.
fold_next_word(no, _F, Acc) ->
    Acc;
fold_next_word({Word, Rest}, F, Acc) ->
    foreach_word_in_string(Rest, F, F(Word, Acc)).
%% isWordChar(C) -> true | false
%%   true when C lies in one of the ranges $a..$z, $A..$Z or $0..$9;
%%   false for anything else.
isWordChar(C) when $a =< C, C =< $z;
                   $A =< C, C =< $Z;
                   $0 =< C, C =< $9 ->
    true;
isWordChar(_) ->
    false.
%% get_word(Str) -> {Word, Rest} | no
%%   Skips leading characters that are not word characters (as decided
%%   by isWordChar/1). If nothing is left, returns 'no'. Otherwise the
%%   first word character seeds collect_word/2, which gathers the rest
%%   of the word and returns it with the remaining input.
get_word(Str) ->
    IsSeparator = fun(C) -> not isWordChar(C) end,
    case lists:dropwhile(IsSeparator, Str) of
        [] ->
            no;
        [First | Tail] ->
            collect_word(Tail, [First])
    end.
%% collect_word(Chars, L) -> {Word, Rest}
%%   L holds the characters of the word collected so far, in reverse
%%   order. The longest prefix of Chars made of word characters
%%   (isWordChar/1) is added to them; Word is the complete word in
%%   reading order and Rest is the input starting at the first
%%   non-word character ([] if the input ran out).
collect_word(Chars, L) ->
    {WordTail, Rest} = lists:splitwith(fun isWordChar/1, Chars),
    %% reverse/2 puts the reversed prefix L back in order and appends
    %% the newly taken characters behind it.
    {lists:reverse(L, WordTail), Rest}.
%% mapreduce(F1, F2, Acc0, L) -> Result
%%   Runs reduce/5 in a separate process and waits for its answer.
%%   reduce/5 starts one process per element X of L, each evaluating
%%   F1(ReducePid, X); the {Key, Value} messages those processes send to
%%   ReducePid are grouped by key and folded with
%%   dict:fold(F2, Acc0, Dict), i.e. F2(Key, Values, Acc).
%%
%%   The reduce process is monitored, so if it terminates before
%%   delivering a result (for instance because F2 raises) the caller
%%   exits with {mapreduce_failed, Reason} instead of blocking forever
%%   in the receive.
mapreduce(F1, F2, Acc0, L) ->
    S = self(),
    {Pid, MonitorRef} =
        spawn_monitor(fun() -> reduce(S, F1, F2, Acc0, L) end),
    receive
        {Pid, Result} ->
            %% Drop the monitor and any 'DOWN' message already queued so
            %% nothing is left behind in the caller's mailbox.
            erlang:demonitor(MonitorRef, [flush]),
            Result;
        {'DOWN', MonitorRef, process, Pid, Reason} ->
            exit({mapreduce_failed, Reason})
    end.
%% reduce(Parent, F1, F2, Acc0, L)
%%   Body of the reduce process started by mapreduce/4.
%%   Spawns one linked map process per element of L, gathers the
%%   {Key, Value} messages they send into a dictionary of
%%   {Key, [Value]} pairs, folds F2 over that dictionary starting from
%%   Acc0 and sends {self(), Result} to Parent.
reduce(Parent, F1, F2, Acc0, L) ->
    %% Trap exits so that the termination of every linked map process
    %% arrives as an 'EXIT' message that collect_replies/2 can count.
    process_flag(trap_exit, true),
    Reducer = self(),
    StartMapper =
        fun(X) ->
            spawn_link(fun() -> do_job(Reducer, F1, X) end)
        end,
    %% One map process for each element of L.
    lists:foreach(StartMapper, L),
    %% Block until as many map processes as there are elements in L
    %% have terminated, merging their replies into an initially empty
    %% dictionary.
    Grouped = collect_replies(length(L), dict:new()),
    Parent ! {Reducer, dict:fold(F2, Acc0, Grouped)}.
%% collect_replies(N, Dict) -> Dict1
%%   Collects and merges {Key, Value} messages until N 'EXIT' messages
%%   have been received, i.e. until N processes have terminated.
%%   Returns a dictionary of {Key, [Value]} pairs in which each key's
%%   values appear in the order their messages were received.
collect_replies(0, Dict) ->
    Dict;
collect_replies(Pending, Dict) ->
    receive
        {'EXIT', _Pid, _Reason} ->
            %% One more process is done, whatever its exit reason.
            collect_replies(Pending - 1, Dict);
        {Key, Val} ->
            %% dict:append/3 starts a fresh [Val] list when Key is not
            %% yet present, so no separate is_key check is needed.
            collect_replies(Pending, dict:append(Key, Val, Dict))
    end.
%% do_job(ReducePid, F, X)
%%   Body of a single map process: applies F to the pid of the reduce
%%   process and the element X this process is responsible for.
do_job(Reducer, MapFun, Item) ->
    MapFun(Reducer, Item).
%% search(Str, Ets, DbName, Idx) -> none | tooMany | [Doc]
%%   Splits Str into words with the same tokenizer used when indexing,
%%   passes each word through indexer_words:process_word/2 and keeps
%%   those for which it answers {yes, Keyword}. Every keyword is looked
%%   up with indexer_couchdb_crawler:lookup_indices/2; keywords whose
%%   lookup yields [] are ignored.
%%     none    - no keyword produced any index entries
%%     tooMany - more than 200 entries are common to all keywords
%%     [Doc]   - otherwise, the documents opened through
%%               hovercraft:open_doc/2 for the common entries
search(Str, Ets, DbName, Idx) ->
    Collect = fun(Word, Seen) -> [Word | Seen] end,
    RawWords = indexer_misc:foreach_word_in_string(Str, Collect, []),
    Processed = [indexer_words:process_word(W, Ets) || W <- RawWords],
    Keywords = [K || {yes, K} <- Processed],
    Hits = [indexer_couchdb_crawler:lookup_indices(K, Idx) || K <- Keywords],
    case [sets:from_list(H) || H <- Hits, H =/= []] of
        [] ->
            none;
        HitSets ->
            %% Only entries present for every remaining keyword qualify.
            Common = sets:to_list(sets:intersection(HitSets)),
            if
                length(Common) > 200 ->
                    tooMany;
                true ->
                    [begin
                         {ok, Doc} = hovercraft:open_doc(DbName, Id),
                         Doc
                     end || Id <- Common]
            end
    end.