@@ -133,11 +133,6 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end)
 	return -ENOMEM;
 }
 
-static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
-{
-	return atomic_fetch_xor(bits, v) ^ bits;
-}
-
 #ifdef CONFIG_PGSTE
 
 struct page *page_table_alloc_pgste(struct mm_struct *mm)
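A side note on the helper deleted in the hunk above: atomic_fetch_xor(bits, v) returns the value the counter held before the XOR, so the trailing "^ bits" turns that into the value after the update. The 2K-fragment code removed in the next hunk relied on that to flip a state bit and examine the whole upper byte of _refcount in one atomic step. A minimal sketch with a hypothetical wrapper name, illustration only and not part of the patch:

static unsigned int flip_fragment_bit(struct ptdesc *ptdesc, unsigned int bit)
{
	/* flip the allocation bit of fragment 'bit' and return the new upper byte */
	return atomic_xor_bits(&ptdesc->_refcount, 0x01U << (bit + 24)) >> 24;
}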
@@ -162,303 +157,85 @@ void page_table_free_pgste(struct page *page)
 
 #endif /* CONFIG_PGSTE */
 
-/*
- * A 2KB-pgtable is either the upper or lower half of a normal page.
- * The second half of the page may be unused or used as another
- * 2KB-pgtable.
- *
- * Whenever possible the parent page for a new 2KB-pgtable is picked
- * from the list of partially allocated pages mm_context_t::pgtable_list.
- * In case the list is empty a new parent page is allocated and added to
- * the list.
- *
- * When a parent page gets fully allocated it contains 2KB-pgtables in both
- * upper and lower halves and is removed from mm_context_t::pgtable_list.
- *
- * When a 2KB-pgtable is freed from the fully allocated parent page that
- * page turns partially allocated and is added to mm_context_t::pgtable_list.
- *
- * If a 2KB-pgtable is freed from the partially allocated parent page that
- * page turns unused and gets removed from mm_context_t::pgtable_list.
- * Furthermore, the unused parent page is released.
- *
- * As follows from the above, no unallocated or fully allocated parent
- * pages are contained in mm_context_t::pgtable_list.
- *
- * The upper byte (bits 24-31) of the parent page _refcount is used
- * for tracking contained 2KB-pgtables and has the following format:
- *
- *   PP  AA
- * 01234567    upper byte (bits 24-31) of struct page::_refcount
- *   ||  ||
- *   ||  |+--- upper 2KB-pgtable is allocated
- *   ||  +---- lower 2KB-pgtable is allocated
- *   |+------- upper 2KB-pgtable is pending for removal
- *   +-------- lower 2KB-pgtable is pending for removal
- *
- * (See commit 620b4e903179 ("s390: use _refcount for pgtables") on why
- * using _refcount is possible).
- *
- * When a 2KB-pgtable is allocated the corresponding AA bit is set to 1.
- * The parent page is either:
- *   - added to mm_context_t::pgtable_list in case the second half of the
- *     parent page is still unallocated;
- *   - removed from mm_context_t::pgtable_list in case both halves of the
- *     parent page are allocated;
- * These operations are protected with mm_context_t::lock.
- *
- * When a 2KB-pgtable is deallocated the corresponding AA bit is set to 0
- * and the corresponding PP bit is set to 1 in a single atomic operation.
- * Thus, PP and AA bits corresponding to the same 2KB-pgtable are mutually
- * exclusive and may never both be set to 1!
- * The parent page is either:
- *   - added to mm_context_t::pgtable_list in case the second half of the
- *     parent page is still allocated;
- *   - removed from mm_context_t::pgtable_list in case the second half of
- *     the parent page is unallocated;
- * These operations are protected with mm_context_t::lock.
- *
- * It is important to understand that mm_context_t::lock only protects
- * mm_context_t::pgtable_list and AA bits, but not the parent page itself
- * and PP bits.
- *
- * Releasing the parent page happens whenever the PP bit turns from 1 to 0,
- * while both AA bits and the second PP bit are already unset. Then the
- * parent page does not contain any 2KB-pgtable fragment anymore, and it has
- * also been removed from mm_context_t::pgtable_list. It is therefore safe
- * to release the page.
- *
- * PGSTE memory spaces use full 4KB-pgtables and do not need most of the
- * logic described above. Both AA bits are set to 1 to denote a 4KB-pgtable,
- * while the PP bits are never used, nor is such a page added to or removed
- * from mm_context_t::pgtable_list.
- *
- * pte_free_defer() overrides those rules: it takes the page off pgtable_list,
- * and prevents both 2K fragments from being reused. pte_free_defer() has to
- * guarantee that its pgtable cannot be reused before the RCU grace period
- * has elapsed (which page_table_free_rcu() does not actually guarantee).
- * But for simplicity, because page->rcu_head overlays page->lru, and because
- * the RCU callback might not be called before the mm_context_t has been freed,
- * pte_free_defer() in this implementation prevents both fragments from being
- * reused, and delays making the call to RCU until both fragments are freed.
- */
 unsigned long *page_table_alloc(struct mm_struct *mm)
 {
-	unsigned long *table;
 	struct ptdesc *ptdesc;
-	unsigned int mask, bit;
-
-	/* Try to get a fragment of a 4K page as a 2K page table */
-	if (!mm_alloc_pgste(mm)) {
-		table = NULL;
-		spin_lock_bh(&mm->context.lock);
-		if (!list_empty(&mm->context.pgtable_list)) {
-			ptdesc = list_first_entry(&mm->context.pgtable_list,
-						  struct ptdesc, pt_list);
-			mask = atomic_read(&ptdesc->_refcount) >> 24;
-			/*
-			 * The pending removal bits must also be checked.
-			 * Failure to do so might lead to an impossible
-			 * value (e.g. 0x13 or 0x23) being written to
-			 * _refcount. Such values violate the assumption
-			 * that pending and allocation bits are mutually
-			 * exclusive, and the rest of the code goes off
-			 * the rails as a result. That could lead to a
-			 * whole bunch of races and corruptions.
-			 */
-			mask = (mask | (mask >> 4)) & 0x03U;
-			if (mask != 0x03U) {
-				table = (unsigned long *) ptdesc_to_virt(ptdesc);
-				bit = mask & 1;		/* =1 -> second 2K */
-				if (bit)
-					table += PTRS_PER_PTE;
-				atomic_xor_bits(&ptdesc->_refcount,
-						0x01U << (bit + 24));
-				list_del_init(&ptdesc->pt_list);
-			}
-		}
-		spin_unlock_bh(&mm->context.lock);
-		if (table)
-			return table;
-	}
-	/* Allocate a fresh page */
+	unsigned long *table;
+
 	ptdesc = pagetable_alloc(GFP_KERNEL, 0);
 	if (!ptdesc)
 		return NULL;
 	if (!pagetable_pte_ctor(ptdesc)) {
 		pagetable_free(ptdesc);
 		return NULL;
 	}
-	/* Initialize page table */
 	table = ptdesc_to_virt(ptdesc);
 	__arch_set_page_dat(table, 1);
-	if (mm_alloc_pgste(mm)) {
-		/* Return 4K page table with PGSTEs */
-		INIT_LIST_HEAD(&ptdesc->pt_list);
-		atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24);
-		memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
-		memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
-	} else {
-		/* Return the first 2K fragment of the page */
-		atomic_xor_bits(&ptdesc->_refcount, 0x01U << 24);
-		memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE);
-		spin_lock_bh(&mm->context.lock);
-		list_add(&ptdesc->pt_list, &mm->context.pgtable_list);
-		spin_unlock_bh(&mm->context.lock);
-	}
+	/* pt_list is used by gmap only */
+	INIT_LIST_HEAD(&ptdesc->pt_list);
+	memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
+	memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
 	return table;
 }
 
-static void page_table_release_check(struct page *page, void *table,
-				     unsigned int half, unsigned int mask)
-{
-	char msg[128];
-
-	if (!IS_ENABLED(CONFIG_DEBUG_VM))
-		return;
-	if (!mask && list_empty(&page->lru))
-		return;
-	snprintf(msg, sizeof(msg),
-		 "Invalid pgtable %p release half 0x%02x mask 0x%02x",
-		 table, half, mask);
-	dump_page(page, msg);
-}
-
-static void pte_free_now(struct rcu_head *head)
+static void pagetable_pte_dtor_free(struct ptdesc *ptdesc)
 {
-	struct ptdesc *ptdesc;
-
-	ptdesc = container_of(head, struct ptdesc, pt_rcu_head);
 	pagetable_pte_dtor(ptdesc);
 	pagetable_free(ptdesc);
 }
 
 void page_table_free(struct mm_struct *mm, unsigned long *table)
 {
-	unsigned int mask, bit, half;
 	struct ptdesc *ptdesc = virt_to_ptdesc(table);
 
-	if (!mm_alloc_pgste(mm)) {
-		/* Free 2K page table fragment of a 4K page */
-		bit = ((unsigned long) table & ~PAGE_MASK)/(PTRS_PER_PTE * sizeof(pte_t));
-		spin_lock_bh(&mm->context.lock);
-		/*
-		 * Mark the page for delayed release. The actual release
-		 * will happen outside of the critical section from this
-		 * function or from __tlb_remove_table()
-		 */
-		mask = atomic_xor_bits(&ptdesc->_refcount, 0x11U << (bit + 24));
-		mask >>= 24;
-		if ((mask & 0x03U) && !folio_test_active(ptdesc_folio(ptdesc))) {
-			/*
-			 * Other half is allocated, and neither half has had
-			 * its free deferred: add page to head of list, to make
-			 * this freed half available for immediate reuse.
-			 */
-			list_add(&ptdesc->pt_list, &mm->context.pgtable_list);
-		} else {
-			/* If page is on list, now remove it. */
-			list_del_init(&ptdesc->pt_list);
-		}
-		spin_unlock_bh(&mm->context.lock);
-		mask = atomic_xor_bits(&ptdesc->_refcount, 0x10U << (bit + 24));
-		mask >>= 24;
-		if (mask != 0x00U)
-			return;
-		half = 0x01U << bit;
-	} else {
-		half = 0x03U;
-		mask = atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24);
-		mask >>= 24;
-	}
-
-	page_table_release_check(ptdesc_page(ptdesc), table, half, mask);
-	if (folio_test_clear_active(ptdesc_folio(ptdesc)))
-		call_rcu(&ptdesc->pt_rcu_head, pte_free_now);
-	else
-		pte_free_now(&ptdesc->pt_rcu_head);
+	pagetable_pte_dtor_free(ptdesc);
 }
 
 void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
 			 unsigned long vmaddr)
 {
 	struct mm_struct *mm;
-	unsigned int bit, mask;
-	struct ptdesc *ptdesc = virt_to_ptdesc(table);
 
 	mm = tlb->mm;
-	if (mm_alloc_pgste(mm)) {
+	if (mm_alloc_pgste(mm))
 		gmap_unlink(mm, table, vmaddr);
-		table = (unsigned long *) ((unsigned long)table | 0x03U);
-		tlb_remove_ptdesc(tlb, table);
-		return;
-	}
-	bit = ((unsigned long) table & ~PAGE_MASK) / (PTRS_PER_PTE * sizeof(pte_t));
-	spin_lock_bh(&mm->context.lock);
-	/*
-	 * Mark the page for delayed release. The actual release will happen
-	 * outside of the critical section from __tlb_remove_table() or from
-	 * page_table_free()
-	 */
-	mask = atomic_xor_bits(&ptdesc->_refcount, 0x11U << (bit + 24));
-	mask >>= 24;
-	if ((mask & 0x03U) && !folio_test_active(ptdesc_folio(ptdesc))) {
-		/*
-		 * Other half is allocated, and neither half has had
-		 * its free deferred: add page to end of list, to make
-		 * this freed half available for reuse once its pending
-		 * bit has been cleared by __tlb_remove_table().
-		 */
-		list_add_tail(&ptdesc->pt_list, &mm->context.pgtable_list);
-	} else {
-		/* If page is on list, now remove it. */
-		list_del_init(&ptdesc->pt_list);
-	}
-	spin_unlock_bh(&mm->context.lock);
-	table = (unsigned long *) ((unsigned long) table | (0x01U << bit));
+	table = (unsigned long *)((unsigned long)table | 0x01U);
 	tlb_remove_ptdesc(tlb, table);
 }
 
 void __tlb_remove_table(void *_table)
 {
-	unsigned int mask = (unsigned long) _table & 0x03U, half = mask;
-	void *table = (void *)((unsigned long) _table ^ mask);
-	struct ptdesc *ptdesc = virt_to_ptdesc(table);
-
-	switch (half) {
-	case 0x00U:	/* pmd, pud, or p4d */
+	struct ptdesc *ptdesc;
+	unsigned int mask;
+	void *table;
+
+	mask = (unsigned long)_table & 0x01U;
+	table = (void *)((unsigned long)_table ^ mask);
+	ptdesc = virt_to_ptdesc(table);
+	if (!mask) {
+		/* pmd, pud, or p4d */
 		pagetable_free(ptdesc);
 		return;
-	case 0x01U:	/* lower 2K of a 4K page table */
-	case 0x02U:	/* higher 2K of a 4K page table */
-		mask = atomic_xor_bits(&ptdesc->_refcount, mask << (4 + 24));
-		mask >>= 24;
-		if (mask != 0x00U)
-			return;
-		break;
-	case 0x03U:	/* 4K page table with pgstes */
-		mask = atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24);
-		mask >>= 24;
-		break;
 	}
-
-	page_table_release_check(ptdesc_page(ptdesc), table, half, mask);
-	if (folio_test_clear_active(ptdesc_folio(ptdesc)))
-		call_rcu(&ptdesc->pt_rcu_head, pte_free_now);
-	else
-		pte_free_now(&ptdesc->pt_rcu_head);
+	pagetable_pte_dtor_free(ptdesc);
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void pte_free_now(struct rcu_head *head)
+{
+	struct ptdesc *ptdesc = container_of(head, struct ptdesc, pt_rcu_head);
+
+	pagetable_pte_dtor_free(ptdesc);
+}
+
 void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
 {
-	struct page *page;
+	struct ptdesc *ptdesc = virt_to_ptdesc(pgtable);
 
-	page = virt_to_page(pgtable);
-	SetPageActive(page);
-	page_table_free(mm, (unsigned long *)pgtable);
+	call_rcu(&ptdesc->pt_rcu_head, pte_free_now);
 	/*
-	 * page_table_free() does not do the pgste gmap_unlink() which
-	 * page_table_free_rcu() does: warn us if pgste ever reaches here.
+	 * THPs are not allowed for KVM guests. Warn if pgste ever reaches here.
+	 * Turn to the generic pte_free_defer() version once gmap is removed.
 	 */
 	WARN_ON_ONCE(mm_has_pgste(mm));
 }
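For context on what the second hunk deletes: the removed comment above encodes the state of the two 2K fragments in bits 24-31 of _refcount. Going by the deleted code, fragment n (n = 0 for the first 2K of the page, n = 1 for the second) owns allocation bit 0x01 << n ("AA") and pending-removal bit 0x10 << n ("PP"), and page_table_alloc() folded the two nibbles together to find a reusable slot. A minimal sketch of that decoding, with a hypothetical helper name, illustration only and not part of the patch:

/* Hypothetical helper, illustration only: pick a free 2K slot, if any. */
static int pick_free_fragment(struct ptdesc *ptdesc)
{
	unsigned int mask = atomic_read(&ptdesc->_refcount) >> 24;

	/* fold PP onto AA: a slot is busy if it is allocated OR pending removal */
	mask = (mask | (mask >> 4)) & 0x03U;
	if (mask == 0x03U)
		return -1;	/* both 2K halves are in use */
	return mask & 1;	/* 0 -> hand out the first 2K, 1 -> the second */
}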
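The replacement scheme is much smaller: with the 2K fragments gone, the only distinction that still has to survive the mmu_gather batching is whether the deferred table is a pte table (needs pagetable_pte_dtor()) or a crst table (pmd, pud, or p4d). The new page_table_free_rcu() tags pte tables by setting the low bit of the page-aligned address before tlb_remove_ptdesc(), and __tlb_remove_table() strips it again. A condensed, illustration-only sketch of that decode, using a hypothetical function name rather than the real __tlb_remove_table():

/* Hypothetical condensation, illustration only: decode the low-bit tag. */
static void remove_table_example(void *_table)
{
	unsigned long addr = (unsigned long)_table;
	struct ptdesc *ptdesc = virt_to_ptdesc((void *)(addr & ~0x01UL));

	if (addr & 0x01UL)
		pagetable_pte_dtor_free(ptdesc);	/* pte table: dtor + free */
	else
		pagetable_free(ptdesc);			/* pmd, pud, or p4d */
}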