@@ -3,19 +3,32 @@
 #include "utils.h"

 #include <cassert>
+#include <cerrno>
 #include <cmath>
 #include <cstdio>
 #include <cstring>
 #include <fstream>
 #include <map>
 #include <string>
 #include <vector>
+#include <atomic>

 #if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__))
+#include <fcntl.h>
 #include <signal.h>
 #include <unistd.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
 #endif

+#define ROUNDUP(X, K) (((X) + (K)-1) & -(K))
+#define IS2POW(X)     (!((X) & ((X)-1)))
+
+#define MAGIC_PATH "magic.dat"
+#define MAGIC_ADDR (char *)0x330000000000
+#define MAGIC_GRAN 2097152
+#define MAGIC_ALGN (sizeof(size_t) * 2)
+
 #define ANSI_COLOR_RED     "\x1b[31m"
 #define ANSI_COLOR_GREEN   "\x1b[32m"
 #define ANSI_COLOR_YELLOW  "\x1b[33m"
@@ -83,6 +96,173 @@ struct llama_model {
     std::map<std::string, struct ggml_tensor *> tensors;
 };

+struct magic {
+    uint32_t magic;
+    std::atomic<unsigned> lock;
+    int fd;
+    size_t commit;
+    size_t offset;
+    size_t capacity;
+    gpt_vocab *vocab;
+    llama_model *model;
+};
+
+static struct magic *mag;
+
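+// Everything below implements a persistent bump allocator. The heap lives
+// in a file-backed mapping of MAGIC_PATH pinned at the fixed address
+// MAGIC_ADDR; because the address never changes, raw pointers stored inside
+// the file (vocab, model) remain valid when a later process maps the same
+// file back in.
+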
+static inline void spin_lock(std::atomic<unsigned> &lock) {
+    // test-and-set spin lock: exchange() returns the previous value, so
+    // keep spinning while another thread already holds the lock
+    while (lock.exchange(1, std::memory_order_acquire));
+}
+
+static inline void spin_unlock(std::atomic<unsigned> &lock) {
+    lock.store(0, std::memory_order_release);
+}
+
+// mmap() wrapper that dies loudly instead of returning MAP_FAILED
+static void *Mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset) {
+    void *res;
+    res = mmap(addr, length, prot, flags, fd, offset);
+    if (res != MAP_FAILED) return res;
+    perror("mmap");
+    exit(77);
+}
+
+// seal the current heap: record its extent and flag the file as reusable
+static void magic_commit(void) {
+    mag->offset = mag->capacity;
+    mag->commit = mag->capacity;
+    mag->magic = 0xFEEDABEE;
+    msync(mag, mag->commit, MS_ASYNC);
+}
+
+static void magic_init(void) {
+    int fd;
+    size_t n;
+    struct stat st;
+    if (mag) return;
+    n = ROUNDUP(sizeof(struct magic), MAGIC_GRAN);
+    if ((fd = open(MAGIC_PATH, O_RDWR)) != -1) {
+        fstat(fd, &st);
+        if (st.st_size >= (off_t)n) {
+            mag = (struct magic *)Mmap(MAGIC_ADDR, n,
+                                       PROT_READ | PROT_WRITE,
+                                       MAP_PRIVATE | MAP_FIXED, fd, 0);
+            if (mag->magic == 0xFEEDABEE) {
+                // committed snapshot found: remap the whole heap copy-on-write
+                mag = (struct magic *)Mmap(MAGIC_ADDR, mag->capacity,
+                                           PROT_READ | PROT_WRITE,
+                                           MAP_PRIVATE | MAP_FIXED, fd, 0);
+                madvise(MAGIC_ADDR, mag->capacity, MADV_WILLNEED);
+                ftruncate(fd, mag->commit);
+                mag->offset = mag->commit;
+                mag->capacity = mag->commit;
+                mag->fd = -1;
+                return;
+            }
+        }
+        ftruncate(fd, 0);
+    } else if ((fd = open(MAGIC_PATH, O_RDWR | O_CREAT | O_TRUNC, 0644)) == -1) {
+        perror(MAGIC_PATH);
+        exit(77);
+    }
+    // no usable snapshot: start a fresh shared mapping that grows with the heap
+    ftruncate(fd, n);
+    mag = (struct magic *)Mmap(MAGIC_ADDR, n,
+                               PROT_READ | PROT_WRITE,
+                               MAP_SHARED | MAP_FIXED, fd, 0);
+    mag->offset = MAGIC_GRAN;
+    mag->fd = fd;
+}
+
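+// First run: magic_init() creates magic.dat and maps it MAP_SHARED, so every
+// allocation below is written through to disk. After the model has loaded,
+// magic_commit() stamps 0xFEEDABEE into the header. Subsequent runs see the
+// stamp, remap the file at the same address, and the model is "loaded" in
+// roughly the time of a single mmap() call.
+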
+void *memalign(size_t a, size_t n) {
+    void *p;
+    size_t i, j, m;
+    magic_init();
+    // round the alignment up to a power of two no smaller than MAGIC_ALGN
+    if (a < MAGIC_ALGN) a = MAGIC_ALGN;
+    while (!IS2POW(a)) ++a;
+    m = n ? n : 1;
+    spin_lock(mag->lock);
+    i = mag->offset;
+    i = i + sizeof(size_t);  // leave room for the size header
+    i = ROUNDUP(i, a);
+    j = ROUNDUP(i + m, MAGIC_GRAN);
+    if (j > mag->capacity) {
+        // grow the arena: extend the file while uncommitted, otherwise
+        // spill into anonymous memory beyond the frozen snapshot
+        if (!mag->magic) {
+            ftruncate(mag->fd, j);
+            p = mmap(MAGIC_ADDR + mag->capacity,
+                     j - mag->capacity, PROT_READ | PROT_WRITE,
+                     MAP_SHARED | MAP_FIXED, mag->fd, mag->capacity);
+        } else {
+            p = mmap(MAGIC_ADDR + mag->capacity,
+                     j - mag->capacity, PROT_READ | PROT_WRITE,
+                     MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
+        }
+        if (p != MAP_FAILED) {
+            mag->capacity = j;
+        } else {
+            spin_unlock(mag->lock);
+            return 0;
+        }
+    }
+    mag->offset = i + m;
+    spin_unlock(mag->lock);
+    p = MAGIC_ADDR + i;
+    ((size_t *)p)[-1] = n;  // stash the request size for malloc_usable_size()
+    return p;
+}
+
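+// memalign() only ever bumps mag->offset forward; free() below is a no-op.
+// That is fine here: the process is short-lived, and a monotonic heap is
+// what makes the snapshot in magic.dat reusable byte for byte.
+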
+int posix_memalign(void **pp, size_t a, size_t n) {
+    int e;
+    void *m;
+    size_t q, r;
+    q = a / sizeof(void *);
+    r = a % sizeof(void *);
+    if (!r && q && IS2POW(q)) {
+        e = errno;
+        m = memalign(a, n);
+        if (m) {
+            *pp = m;
+            return 0;
+        } else {
+            errno = e;
+            return ENOMEM;
+        }
+    } else {
+        return EINVAL;
+    }
+}
+
+void *malloc(size_t n) {
+    return memalign(MAGIC_ALGN, n);
+}
+
+size_t malloc_usable_size(const void *p) {
+    return ((const size_t *)p)[-1];
+}
+
+void *calloc(size_t n, size_t z) {
+    void *p;
+    if (z && n > (size_t)-1 / z) return 0;  // reject n * z overflow
+    n *= z;
+    if ((p = malloc(n))) {
+        memset(p, 0, n);
+    }
+    return p;
+}
+
+void free(void *p) {
+    (void)p;  // bump allocator: individual frees are intentionally a no-op
+}
+
+void *realloc(void *p, size_t n) {
+    void *q;
+    size_t m;
+    if (!p) {
+        return malloc(n);
+    }
+    if (!n) {
+        free(p);
+        return 0;
+    }
+    if ((q = malloc(n))) {
+        // copy only the smaller of the old and new sizes
+        m = ((const size_t *)p)[-1];
+        memcpy(q, p, n < m ? n : m);
+    }
+    return q;
+}
+
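+// Defining malloc/calloc/realloc/free in the executable interposes on the
+// C library's versions (standard behavior for ELF executables linked against
+// glibc or musl), so every allocation made by the program, including ggml's,
+// lands inside the magic arena.
+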
 // load the model's weights from a file
 bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) {
     fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
@@ -786,6 +966,8 @@ const char * llama_print_system_info(void) {
 }

 int main(int argc, char ** argv) {
+    magic_init();
+
     ggml_time_init();
     const int64_t t_main_start_us = ggml_time_us();

@@ -812,19 +994,24 @@ int main(int argc, char ** argv) {

     int64_t t_load_us = 0;

-    gpt_vocab vocab;
-    llama_model model;
-
     // load the model
-    {
+    gpt_vocab *vocab;
+    llama_model *model;
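+    // first run: mag->magic is still zero, so load and snapshot the model;
+    // later runs find the committed vocab/model pointers already in the arena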
+    if (!mag->magic) {
+        vocab = new gpt_vocab;
+        model = new llama_model;
         const int64_t t_start_us = ggml_time_us();
-
-        if (!llama_model_load(params.model, model, vocab, 512)) {  // TODO: set context from user input ??
+        if (!llama_model_load(params.model, *model, *vocab, 512)) {  // TODO: set context from user input ??
             fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
             return 1;
         }
-
         t_load_us = ggml_time_us() - t_start_us;
+        mag->vocab = vocab;
+        mag->model = model;
+        magic_commit();
+    } else {
+        vocab = mag->vocab;
+        model = mag->model;
     }

     // print system information
@@ -842,18 +1029,18 @@ int main(int argc, char ** argv) {
     std::vector<float> logits;

     // tokenize the prompt
-    std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(vocab, params.prompt, true);
+    std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(*vocab, params.prompt, true);

-    params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
+    params.n_predict = std::min(params.n_predict, model->hparams.n_ctx - (int) embd_inp.size());

     // tokenize the reverse prompt
-    std::vector<gpt_vocab::id> antiprompt_inp = ::llama_tokenize(vocab, params.antiprompt, false);
+    std::vector<gpt_vocab::id> antiprompt_inp = ::llama_tokenize(*vocab, params.antiprompt, false);

     fprintf(stderr, "\n");
     fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
     fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
     for (int i = 0; i < (int) embd_inp.size(); i++) {
-        fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
+        fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], vocab->id_to_token.at(embd_inp[i]).c_str());
     }
     fprintf(stderr, "\n");
     if (params.interactive) {
@@ -871,7 +1058,7 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "%s: reverse prompt: '%s'\n", __func__, params.antiprompt.c_str());
         fprintf(stderr, "%s: number of tokens in reverse prompt = %zu\n", __func__, antiprompt_inp.size());
         for (int i = 0; i < (int) antiprompt_inp.size(); i++) {
-            fprintf(stderr, "%6d -> '%s'\n", antiprompt_inp[i], vocab.id_to_token.at(antiprompt_inp[i]).c_str());
+            fprintf(stderr, "%6d -> '%s'\n", antiprompt_inp[i], vocab->id_to_token.at(antiprompt_inp[i]).c_str());
         }
         fprintf(stderr, "\n");
     }
@@ -883,7 +1070,7 @@ int main(int argc, char ** argv) {

     // determine the required inference memory per token:
     size_t mem_per_token = 0;
-    llama_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
+    llama_eval(*model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);

     int last_n_size = params.repeat_last_n;
     std::vector<gpt_vocab::id> last_n_tokens(last_n_size);
@@ -918,7 +1105,7 @@ int main(int argc, char ** argv) {
         if (embd.size() > 0) {
             const int64_t t_start_us = ggml_time_us();

-            if (!llama_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
+            if (!llama_eval(*model, params.n_threads, n_past, embd, logits, mem_per_token)) {
                 fprintf(stderr, "Failed to predict\n");
                 return 1;
             }
@@ -936,14 +1123,14 @@ int main(int argc, char ** argv) {
             const float temp = params.temp;
             const float repeat_penalty = params.repeat_penalty;

-            const int n_vocab = model.hparams.n_vocab;
+            const int n_vocab = model->hparams.n_vocab;

             gpt_vocab::id id = 0;

             {
                 const int64_t t_start_sample_us = ggml_time_us();

-                id = llama_sample_top_p_top_k(vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens, repeat_penalty, top_k, top_p, temp, rng);
+                id = llama_sample_top_p_top_k(*vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens, repeat_penalty, top_k, top_p, temp, rng);

                 last_n_tokens.erase(last_n_tokens.begin());
                 last_n_tokens.push_back(id);
@@ -980,7 +1167,7 @@ int main(int argc, char ** argv) {
         // display text
         if (!input_noecho) {
             for (auto id : embd) {
-                printf("%s", vocab.id_to_token[id].c_str());
+                printf("%s", vocab->id_to_token[id].c_str());
             }
             fflush(stdout);
         }
@@ -1018,7 +1205,7 @@ int main(int argc, char ** argv) {
                     buf[n_read+1] = 0;
                 }

-                std::vector<gpt_vocab::id> line_inp = ::llama_tokenize(vocab, buf, false);
+                std::vector<gpt_vocab::id> line_inp = ::llama_tokenize(*vocab, buf, false);
                 embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());

                 remaining_tokens -= line_inp.size();
@@ -1050,7 +1237,7 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
     }

-    ggml_free(model.ctx);
+    ggml_free(model->ctx);

     return 0;
 }