-
Notifications
You must be signed in to change notification settings - Fork 8
/
get_raw_ngrams.cc
109 lines (103 loc) · 3.38 KB
/
get_raw_ngrams.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <vector>
// Writes to stdout every raw N-gram of one sentence, one per line, in the form
//   <history, most recent word first, space separated>\t<predicted word>\n
// "A" is appended to the history while it is shorter than N-1 words
// (begin-of-sentence), and a final line predicts "B" (end-of-sentence).
//
//  line_in  NUL-terminated sentence: words separated by exactly one whitespace
//           character, optionally terminated by a single '\n'.  Leading or
//           repeated whitespace is rejected with an assertion.  The buffer is
//           only read, never modified.
//  N        the N-gram order; at most N-1 history words are written.
//
// On an unrecoverable write error the function reports it on stderr and exits.
inline void process_line(char *line_in, int N) {
  // Scratch storage is static so its capacity is reused from call to call.
  static std::vector<size_t> word_starts;  // offset of each word in line_in
  static std::vector<size_t> word_lens;    // length of each word
  static std::vector<char> out;            // everything we will write
  word_starts.clear();
  word_lens.clear();
  out.clear();

  // Split the line into words.  A line consisting of just "\n" (or an empty
  // string) is an empty sentence with zero words.
  const char *cur = line_in;
  if (*cur != '\n') {
    // isspace() requires a value representable as unsigned char.
    assert(!isspace(static_cast<unsigned char>(*cur)));
    while (*cur != '\0') {
      const char *word_begin = cur;
      // Stop at the NUL as well as at whitespace, so that a final line with
      // no terminating newline cannot make us scan past the buffer.
      while (*cur != '\0' && !isspace(static_cast<unsigned char>(*cur))) cur++;
      word_starts.push_back(static_cast<size_t>(word_begin - line_in));
      word_lens.push_back(static_cast<size_t>(cur - word_begin));
      if (*cur != '\0') cur++;  // step over the single separator / newline.
      assert(!isspace(static_cast<unsigned char>(*cur)) &&
             "get_raw_ngrams doesn't accept extra spaces!");
      // We don't allow >1 space in succession.
      // This program expects very specific input.
    }
  }

  const int num_words = static_cast<int>(word_starts.size());
  for (int n = 0; n <= num_words; n++) {
    // First output the history, starting from the most recent word.
    for (int m = n - 1; m > n - N && m >= 0; m--) {
      const char *word = line_in + word_starts[m];
      out.insert(out.end(), word, word + word_lens[m]);
      // Omit the space after the last history word we'll write; when the
      // history is padded with A below, every word keeps its space.
      if (m != n - N + 1) out.push_back(' ');
    }
    if (n < N - 1) {  // Include A (begin-of-sentence) in the history.
      out.push_back('A');
    }
    out.push_back('\t');
    // Now write the predicted word.
    if (n < num_words) {
      const char *word = line_in + word_starts[n];
      out.insert(out.end(), word, word + word_lens[n]);
    } else {
      out.push_back('B');  // Predicted word is B (end-of-sentence marker).
    }
    out.push_back('\n');
  }

  // `out` is never empty here: the end-of-sentence line is always produced.
  // write() may legally write fewer bytes than requested or be interrupted by
  // a signal, so loop until everything has been written.
  const int stdout_fd = 1;
  const char *pending = &out[0];
  size_t remaining = out.size();
  while (remaining > 0) {
    ssize_t written = write(stdout_fd, pending, remaining);
    if (written < 0) {
      if (errno == EINTR) continue;
      perror("get_raw_ngrams: write");
      exit(1);
    }
    pending += written;
    remaining -= static_cast<size_t>(written);
  }
}
int main(int argc, char **argv) {
  // This program takes one argument, the value of N
  // (in N-grams.)  It reads the stdin, e.g.
  // a b c
  // d e
  // and (suppose N=3), it outputs the N-grams with the histories first (reversed),
  // then the predicted word, i.e.:
  // A\ta
  // a A\tb
  // b a\tc
  // c b\tB
  // A\td
  // etc.
  // We assume A is the begin-of-sentence symbol and B is the end-of-sentence
  // symbol.  We'll pipe this output into sort and uniq -c.
  // Note: this program will not accept lines with "extra" spaces!
  if (argc != 2) {
    fprintf(stderr, "Usage: get_raw_ngrams N <text\n"
            "Warning: this program will not work if there are extra spaces\n"
            "in the input. It uses our \"special names\" (A,B,C) for <s>,\n"
            "</s> and <UNK>.\n");
    exit(1);
  }

  // N must be a positive integer that fits in an int.  The diagnostic goes to
  // stderr (stdout carries the N-gram data) and we stop instead of producing
  // meaningless output.
  const long parsed_n = strtol(argv[1], NULL, 10);
  const int N = static_cast<int>(parsed_n);
  if (parsed_n < 1 || static_cast<long>(N) != parsed_n) {
    fprintf(stderr, "Invalid value N = %s\n", argv[1]);
    exit(1);
  }

  // getline() allocates and grows the buffer itself (with malloc/realloc)
  // when handed a NULL pointer and zero size.
  char *line = NULL;
  size_t nbytes = 0;
  while (getline(&line, &nbytes, stdin) != -1) {
    process_line(line, N);
  }
  const bool read_failed = ferror(stdin) != 0;
  if (read_failed) perror("get_raw_ngrams: reading stdin");
  free(line);  // the buffer came from malloc/realloc, so free(), not delete.
  return read_failed ? 1 : 0;
}