3232#include <linux/security.h>
3333#include <linux/memcontrol.h>
3434#include <linux/syscalls.h>
35+ #include <linux/hugetlb.h>
3536#include <linux/gfp.h>
3637
3738#include "internal.h"
@@ -95,26 +96,34 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
9596 pte_t * ptep , pte ;
9697 spinlock_t * ptl ;
9798
98- pgd = pgd_offset (mm , addr );
99- if (!pgd_present (* pgd ))
100- goto out ;
99+ if (unlikely (PageHuge (new ))) {
100+ ptep = huge_pte_offset (mm , addr );
101+ if (!ptep )
102+ goto out ;
103+ ptl = & mm -> page_table_lock ;
104+ } else {
105+ pgd = pgd_offset (mm , addr );
106+ if (!pgd_present (* pgd ))
107+ goto out ;
101108
102- pud = pud_offset (pgd , addr );
103- if (!pud_present (* pud ))
104- goto out ;
109+ pud = pud_offset (pgd , addr );
110+ if (!pud_present (* pud ))
111+ goto out ;
105112
106- pmd = pmd_offset (pud , addr );
107- if (!pmd_present (* pmd ))
108- goto out ;
113+ pmd = pmd_offset (pud , addr );
114+ if (!pmd_present (* pmd ))
115+ goto out ;
109116
110- ptep = pte_offset_map (pmd , addr );
117+ ptep = pte_offset_map (pmd , addr );
111118
112- if (!is_swap_pte (* ptep )) {
113- pte_unmap (ptep );
114- goto out ;
115- }
119+ if (!is_swap_pte (* ptep )) {
120+ pte_unmap (ptep );
121+ goto out ;
122+ }
123+
124+ ptl = pte_lockptr (mm , pmd );
125+ }
116126
117- ptl = pte_lockptr (mm , pmd );
118127 spin_lock (ptl );
119128 pte = * ptep ;
120129 if (!is_swap_pte (pte ))
@@ -130,10 +139,17 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
130139 pte = pte_mkold (mk_pte (new , vma -> vm_page_prot ));
131140 if (is_write_migration_entry (entry ))
132141 pte = pte_mkwrite (pte );
142+ if (PageHuge (new ))
143+ pte = pte_mkhuge (pte );
133144 flush_cache_page (vma , addr , pte_pfn (pte ));
134145 set_pte_at (mm , addr , ptep , pte );
135146
136- if (PageAnon (new ))
147+ if (PageHuge (new )) {
148+ if (PageAnon (new ))
149+ hugepage_add_anon_rmap (new , vma , addr );
150+ else
151+ page_dup_rmap (new );
152+ } else if (PageAnon (new ))
137153 page_add_anon_rmap (new , vma , addr );
138154 else
139155 page_add_file_rmap (new );
@@ -275,12 +291,60 @@ static int migrate_page_move_mapping(struct address_space *mapping,
275291 return 0 ;
276292}
277293
294+ /*
295+ * The expected number of remaining references is the same as that
296+ * of migrate_page_move_mapping().
297+ */
298+ int migrate_huge_page_move_mapping (struct address_space * mapping ,
299+ struct page * newpage , struct page * page )
300+ {
301+ int expected_count ;
302+ void * * pslot ;
303+
304+ if (!mapping ) {
305+ if (page_count (page ) != 1 )
306+ return - EAGAIN ;
307+ return 0 ;
308+ }
309+
310+ spin_lock_irq (& mapping -> tree_lock );
311+
312+ pslot = radix_tree_lookup_slot (& mapping -> page_tree ,
313+ page_index (page ));
314+
315+ expected_count = 2 + page_has_private (page );
316+ if (page_count (page ) != expected_count ||
317+ (struct page * )radix_tree_deref_slot (pslot ) != page ) {
318+ spin_unlock_irq (& mapping -> tree_lock );
319+ return - EAGAIN ;
320+ }
321+
322+ if (!page_freeze_refs (page , expected_count )) {
323+ spin_unlock_irq (& mapping -> tree_lock );
324+ return - EAGAIN ;
325+ }
326+
327+ get_page (newpage );
328+
329+ radix_tree_replace_slot (pslot , newpage );
330+
331+ page_unfreeze_refs (page , expected_count );
332+
333+ __put_page (page );
334+
335+ spin_unlock_irq (& mapping -> tree_lock );
336+ return 0 ;
337+ }
338+
278339/*
279340 * Copy the page to its new location
280341 */
281- static void migrate_page_copy (struct page * newpage , struct page * page )
342+ void migrate_page_copy (struct page * newpage , struct page * page )
282343{
283- copy_highpage (newpage , page );
344+ if (PageHuge (page ))
345+ copy_huge_page (newpage , page );
346+ else
347+ copy_highpage (newpage , page );
284348
285349 if (PageError (page ))
286350 SetPageError (newpage );
@@ -723,6 +787,92 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
723787 return rc ;
724788}
725789
790+ /*
791+ * Counterpart of unmap_and_move_page() for hugepage migration.
792+ *
793+ * This function doesn't wait the completion of hugepage I/O
794+ * because there is no race between I/O and migration for hugepage.
795+ * Note that currently hugepage I/O occurs only in direct I/O
796+ * where no lock is held and PG_writeback is irrelevant,
797+ * and writeback status of all subpages are counted in the reference
798+ * count of the head page (i.e. if all subpages of a 2MB hugepage are
799+ * under direct I/O, the reference of the head page is 512 and a bit more.)
800+ * This means that when we try to migrate hugepage whose subpages are
801+ * doing direct I/O, some references remain after try_to_unmap() and
802+ * hugepage migration fails without data corruption.
803+ *
804+ * There is also no race when direct I/O is issued on the page under migration,
805+ * because then pte is replaced with migration swap entry and direct I/O code
806+ * will wait in the page fault for migration to complete.
807+ */
808+ static int unmap_and_move_huge_page (new_page_t get_new_page ,
809+ unsigned long private , struct page * hpage ,
810+ int force , int offlining )
811+ {
812+ int rc = 0 ;
813+ int * result = NULL ;
814+ struct page * new_hpage = get_new_page (hpage , private , & result );
815+ int rcu_locked = 0 ;
816+ struct anon_vma * anon_vma = NULL ;
817+
818+ if (!new_hpage )
819+ return - ENOMEM ;
820+
821+ rc = - EAGAIN ;
822+
823+ if (!trylock_page (hpage )) {
824+ if (!force )
825+ goto out ;
826+ lock_page (hpage );
827+ }
828+
829+ if (PageAnon (hpage )) {
830+ rcu_read_lock ();
831+ rcu_locked = 1 ;
832+
833+ if (page_mapped (hpage )) {
834+ anon_vma = page_anon_vma (hpage );
835+ atomic_inc (& anon_vma -> external_refcount );
836+ }
837+ }
838+
839+ try_to_unmap (hpage , TTU_MIGRATION |TTU_IGNORE_MLOCK |TTU_IGNORE_ACCESS );
840+
841+ if (!page_mapped (hpage ))
842+ rc = move_to_new_page (new_hpage , hpage , 1 );
843+
844+ if (rc )
845+ remove_migration_ptes (hpage , hpage );
846+
847+ if (anon_vma && atomic_dec_and_lock (& anon_vma -> external_refcount ,
848+ & anon_vma -> lock )) {
849+ int empty = list_empty (& anon_vma -> head );
850+ spin_unlock (& anon_vma -> lock );
851+ if (empty )
852+ anon_vma_free (anon_vma );
853+ }
854+
855+ if (rcu_locked )
856+ rcu_read_unlock ();
857+ out :
858+ unlock_page (hpage );
859+
860+ if (rc != - EAGAIN ) {
861+ list_del (& hpage -> lru );
862+ put_page (hpage );
863+ }
864+
865+ put_page (new_hpage );
866+
867+ if (result ) {
868+ if (rc )
869+ * result = rc ;
870+ else
871+ * result = page_to_nid (new_hpage );
872+ }
873+ return rc ;
874+ }
875+
726876/*
727877 * migrate_pages
728878 *
@@ -788,6 +938,52 @@ int migrate_pages(struct list_head *from,
788938 return nr_failed + retry ;
789939}
790940
941+ int migrate_huge_pages (struct list_head * from ,
942+ new_page_t get_new_page , unsigned long private , int offlining )
943+ {
944+ int retry = 1 ;
945+ int nr_failed = 0 ;
946+ int pass = 0 ;
947+ struct page * page ;
948+ struct page * page2 ;
949+ int rc ;
950+
951+ for (pass = 0 ; pass < 10 && retry ; pass ++ ) {
952+ retry = 0 ;
953+
954+ list_for_each_entry_safe (page , page2 , from , lru ) {
955+ cond_resched ();
956+
957+ rc = unmap_and_move_huge_page (get_new_page ,
958+ private , page , pass > 2 , offlining );
959+
960+ switch (rc ) {
961+ case - ENOMEM :
962+ goto out ;
963+ case - EAGAIN :
964+ retry ++ ;
965+ break ;
966+ case 0 :
967+ break ;
968+ default :
969+ /* Permanent failure */
970+ nr_failed ++ ;
971+ break ;
972+ }
973+ }
974+ }
975+ rc = 0 ;
976+ out :
977+
978+ list_for_each_entry_safe (page , page2 , from , lru )
979+ put_page (page );
980+
981+ if (rc )
982+ return rc ;
983+
984+ return nr_failed + retry ;
985+ }
986+
791987#ifdef CONFIG_NUMA
792988/*
793989 * Move a list of individual pages
0 commit comments