make the smp way RTS-only, normal libraries now work with -smp

We had to bite the bullet here and add an extra word to every thunk,
to enable running ordinary libraries on SMP.  Otherwise, we would have
needed to ship an extra set of libraries with GHC 6.6 in addition to
the two sets we already ship (normal + profiled), and all Cabal
packages would have to be compiled for SMP too.  We decided it best
just to take the hit now, making SMP easily accessible to everyone in
GHC 6.6.

Incidentally, although this increases allocation by around 12% on
average, the performance hit is around 5%, and much less if your inner
loop doesn't use any laziness.
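
Concretely, the cost comes from one extra padding word in every thunk header (see the now-unconditional thunkHdrSize in the SMRep hunk below). A back-of-the-envelope sketch of that accounting, using assumed word counts rather than GHC's generated Constants values:

    -- Sketch only: the counts here are illustrative assumptions, not the
    -- values produced by GHC's generated Constants module.
    fixedHdrWords, smpThunkPadWords :: Int
    fixedHdrWords    = 1   -- info pointer only (no profiling headers assumed)
    smpThunkPadWords = 1   -- the extra padding word this commit adds to thunks

    thunkHdrWords :: Int
    thunkHdrWords = fixedHdrWords + smpThunkPadWords
    -- e.g. a thunk that was header + 1 payload word (2 words) now takes 3.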
1 parent 76e3742 · commit beb5737b7ee42c4e9373a505e7d957206d69a30e · Simon Marlow committed Feb 8, 2006
@@ -32,7 +32,7 @@ import MachOp
import SMRep
import ForeignCall
import Constants
-import StaticFlags ( opt_SccProfilingOn, opt_SMP )
+import StaticFlags ( opt_SccProfilingOn )
import Outputable
import Monad ( when )
@@ -85,11 +85,10 @@ emitForeignCall results (CCall (CCallSpec target cconv safety)) args live
)
stmtC (the_call vols)
stmtC (CmmCall (CmmForeignCall resumeThread CCallConv)
- (if opt_SMP then [(CmmGlobal BaseReg, PtrHint)] else [])
- -- Assign the result to BaseReg: we might now have
- -- a different Capability! Small optimisation:
- -- only do this in SMP mode, where there are >1
- -- Capabilities.
+ [ (CmmGlobal BaseReg, PtrHint) ]
+ -- Assign the result to BaseReg: we
+ -- might now have a different
+ -- Capability!
[ (CmmReg id, PtrHint) ]
(Just vols)
)
@@ -23,7 +23,6 @@ module CgHeapery (
#include "HsVersions.h"
-import Constants ( mIN_UPD_SIZE )
import StgSyn ( AltType(..) )
import CLabel ( CLabel, mkRtsCodeLabel )
import CgUtils ( mkWordCLit, cmmRegOffW, cmmOffsetW,
@@ -212,8 +211,7 @@ mkStaticClosureFields cl_info ccs caf_refs payload
padding_wds
| not is_caf = []
- | otherwise = replicate n (mkIntCLit 0) -- a bunch of 0s
- where n = max 0 (mIN_UPD_SIZE - length payload)
+ | otherwise = ASSERT(null payload) [mkIntCLit 0]
static_link_field
| is_caf || staticClosureNeedsLink cl_info = [static_link_value]
@@ -28,7 +28,7 @@ import SMRep
import PrimOp ( PrimOp(..) )
import SMRep ( tablesNextToCode )
import Constants ( wORD_SIZE, wORD_SIZE_IN_BITS )
-import StaticFlags ( opt_Parallel, opt_SMP )
+import StaticFlags ( opt_Parallel )
import Outputable
-- ---------------------------------------------------------------------------
@@ -113,9 +113,6 @@ emitPrimOp [res_r,res_c] IntSubCOp [aa,bb] live
emitPrimOp [res] ParOp [arg] live
- | not (opt_Parallel || opt_SMP)
- = stmtC (CmmAssign res (CmmLit (mkIntCLit 1)))
- | otherwise
= do
-- for now, just implement this in a C function
-- later, we might want to inline it.
@@ -61,11 +61,10 @@ import SMRep -- all of it
import CLabel
-import Constants ( mIN_UPD_SIZE, mIN_SIZE_NonUpdHeapObject )
+import Constants ( mIN_PAYLOAD_SIZE )
import Packages ( isDllName, HomeModules )
import StaticFlags ( opt_SccProfilingOn, opt_OmitBlackHoling,
- opt_Parallel, opt_DoTickyProfiling,
- opt_SMP )
+ opt_Parallel, opt_DoTickyProfiling )
import Id ( Id, idType, idArity, idName )
import DataCon ( DataCon, dataConTyCon, isNullaryRepDataCon, dataConName )
import Name ( Name, nameUnique, getOccName, getOccString )
@@ -387,16 +386,8 @@ Computing slop size. WARNING: this looks dodgy --- it has deep
knowledge of what the storage manager does with the various
representations...
-Slop Requirements:
-
- - Updatable closures must be mIN_UPD_SIZE.
-
- - Heap-resident Closures must be mIN_SIZE_NonUpdHeapObject
- (to make room for an StgEvacuated during GC).
-
-In SMP mode, we don't play the mIN_UPD_SIZE game. Instead, every
-thunk gets an extra padding word in the header, which takes the
-the updated value.
+Slop Requirements: every thunk gets an extra padding word in the
+header, which takes the updated value.
\begin{code}
slopSize cl_info = computeSlopSize payload_size cl_info
@@ -423,16 +414,14 @@ minPayloadSize smrep updatable
BlackHoleRep -> min_upd_size
GenericRep _ _ _ _ | updatable -> min_upd_size
GenericRep True _ _ _ -> 0 -- static
- GenericRep False _ _ _ -> mIN_SIZE_NonUpdHeapObject
+ GenericRep False _ _ _ -> mIN_PAYLOAD_SIZE
-- ^^^^^___ dynamic
where
- min_upd_size
- | opt_SMP = ASSERT(mIN_SIZE_NonUpdHeapObject <=
- sIZEOF_StgSMPThunkHeader)
- 0 -- check that we already have enough
- -- room for mIN_SIZE_NonUpdHeapObject,
- -- due to the extra header word in SMP
- | otherwise = mIN_UPD_SIZE
+ min_upd_size =
+ ASSERT(mIN_PAYLOAD_SIZE <= sIZEOF_StgSMPThunkHeader)
+ 0 -- check that we already have enough
+ -- room for mIN_SIZE_NonUpdHeapObject,
+ -- due to the extra header word in SMP
\end{code}
%************************************************************************
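
The two hunks above replace the old mIN_UPD_SIZE/mIN_SIZE_NonUpdHeapObject pair with a single minimum. A self-contained sketch of the resulting rule, assuming mIN_PAYLOAD_SIZE is 1 (the real value comes from the generated Constants module, and the real slopSize goes through computeSlopSize, which is not shown in this diff):

    -- Sketch only; mIN_PAYLOAD_SIZE = 1 is an assumption for illustration.
    mIN_PAYLOAD_SIZE :: Int
    mIN_PAYLOAD_SIZE = 1

    -- Updatable closures need no payload slop any more: the extra word in
    -- the thunk header already leaves room for the update.  Dynamic
    -- non-updatable heap objects must still reach the minimum payload size.
    minPayloadSizeSketch :: Bool  -- static?
                         -> Bool  -- updatable?
                         -> Int
    minPayloadSizeSketch _isStatic True  = 0                 -- updatable
    minPayloadSizeSketch True      False = 0                 -- static
    minPayloadSizeSketch False     False = mIN_PAYLOAD_SIZE  -- dynamic, non-updatable

    slopSketch :: Bool -> Bool -> Int -> Int
    slopSketch isStatic updatable payloadWords =
      max 0 (minPayloadSizeSketch isStatic updatable - payloadWords)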
@@ -600,9 +589,11 @@ getCallMethod hmods name (LFThunk _ _ updatable std_form_info is_fun) n_args
-- is the fast-entry code]
| updatable || opt_DoTickyProfiling -- to catch double entry
- || opt_SMP -- Always enter via node on SMP, since the
- -- thunk might have been blackholed in the
- -- meantime.
+ {- OLD: || opt_SMP
+ I decided to remove this, because in SMP mode it doesn't matter
+ if we enter the same thunk multiple times, so the optimisation
+ of jumping directly to the entry code is still valid. --SDM
+ -}
= ASSERT( n_args == 0 ) EnterIt
| otherwise -- Jump direct to code for single-entry thunks
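
The removed opt_SMP guard was the only thing forcing single-entry thunks through EnterIt; after this change only updatable (or ticky-profiled) thunks must be entered via node. A stripped-down sketch of that decision, with simplified names (the real function also handles standard-form thunks and returns a jump target):

    -- Sketch only: a minimal version of the thunk call-method choice,
    -- ignoring standard-form thunks and the other LambdaFormInfo cases.
    data CallMethodSketch = EnterIt | JumpDirect

    thunkCallMethod :: Bool  -- updatable?
                    -> Bool  -- ticky profiling on (to catch double entry)?
                    -> CallMethodSketch
    thunkCallMethod updatable ticky
      | updatable || ticky = EnterIt     -- must enter via node
      | otherwise          = JumpDirect  -- single-entry: jump straight to the code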
@@ -43,7 +43,7 @@ import Type ( Type, typePrimRep, PrimRep(..) )
import TyCon ( TyCon, tyConPrimRep )
import MachOp-- ( MachRep(..), MachHint(..), wordRep )
import StaticFlags ( opt_SccProfilingOn, opt_GranMacros,
- opt_Unregisterised, opt_SMP )
+ opt_Unregisterised )
import Constants
import Outputable
@@ -289,8 +289,7 @@ arrPtrsHdrSize = fixedHdrSize*wORD_SIZE + sIZEOF_StgMutArrPtrs_NoHdr
-- Thunks have an extra header word on SMP, so the update doesn't
-- splat the payload.
thunkHdrSize :: WordOff
-thunkHdrSize | opt_SMP = fixedHdrSize + smp_hdr
- | otherwise = fixedHdrSize
+thunkHdrSize = fixedHdrSize + smp_hdr
where smp_hdr = sIZEOF_StgSMPThunkHeader `quot` wORD_SIZE
\end{code}
@@ -254,6 +254,7 @@ mkBits findLabel st proto_insns
ALLOC_AP n -> instr2 st bci_ALLOC_AP n
ALLOC_PAP arity n -> instr3 st bci_ALLOC_PAP arity n
MKAP off sz -> instr3 st bci_MKAP off sz
+ MKPAP off sz -> instr3 st bci_MKPAP off sz
UNPACK n -> instr2 st bci_UNPACK n
PACK dcon sz -> do (itbl_no,st2) <- itbl st dcon
instr3 st2 bci_PACK itbl_no sz
@@ -398,6 +399,7 @@ instrSize16s instr
ALLOC_AP{} -> 2
ALLOC_PAP{} -> 3
MKAP{} -> 3
+ MKPAP{} -> 3
UNPACK{} -> 2
PACK{} -> 3
LABEL{} -> 0 -- !!
@@ -52,7 +52,7 @@ import Bitmap ( intsToReverseBitmap, mkBitmap )
import OrdList
import Constants ( wORD_SIZE )
-import Data.List ( intersperse, sortBy, zip4, zip5, partition )
+import Data.List ( intersperse, sortBy, zip4, zip6, partition )
import Foreign ( Ptr, castPtr, mallocBytes, pokeByteOff, Word8,
withForeignPtr )
import Foreign.C ( CInt )
@@ -361,26 +361,28 @@ schemeE d s p (AnnLet binds (_,body))
zipE = zipEqual "schemeE"
-- ToDo: don't build thunks for things with no free variables
- build_thunk dd [] size bco off
- = returnBc (PUSH_BCO bco
- `consOL` unitOL (MKAP (off+size) size))
- build_thunk dd (fv:fvs) size bco off = do
+ build_thunk dd [] size bco off arity
+ = returnBc (PUSH_BCO bco `consOL` unitOL (mkap (off+size) size))
+ where
+ mkap | arity == 0 = MKAP
+ | otherwise = MKPAP
+ build_thunk dd (fv:fvs) size bco off arity = do
(push_code, pushed_szw) <- pushAtom dd p' (AnnVar fv)
- more_push_code <- build_thunk (dd+pushed_szw) fvs size bco off
+ more_push_code <- build_thunk (dd+pushed_szw) fvs size bco off arity
returnBc (push_code `appOL` more_push_code)
alloc_code = toOL (zipWith mkAlloc sizes arities)
where mkAlloc sz 0 = ALLOC_AP sz
mkAlloc sz arity = ALLOC_PAP arity sz
- compile_bind d' fvs x rhs size off = do
+ compile_bind d' fvs x rhs size arity off = do
bco <- schemeR fvs (x,rhs)
- build_thunk d' fvs size bco off
+ build_thunk d' fvs size bco off arity
compile_binds =
- [ compile_bind d' fvs x rhs size n
- | (fvs, x, rhs, size, n) <-
- zip5 fvss xs rhss sizes [n_binds, n_binds-1 .. 1]
+ [ compile_bind d' fvs x rhs size arity n
+ | (fvs, x, rhs, size, arity, n) <-
+ zip6 fvss xs rhss sizes arities [n_binds, n_binds-1 .. 1]
]
in do
body_code <- schemeE d' s p' body
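
The build_thunk change above threads each binding's arity through so that the matching closure-building instruction is emitted: MKAP for an arity-0 thunk, the new MKPAP for a partial application. The selection in isolation, with a simplified instruction type (sketch only, not the real BCInstr):

    -- Sketch only: simplified instruction type; the real BCInstr lives in
    -- ByteCodeInstr and carries many more constructors.
    data BCInstrSketch
      = MKAP  Int Int   -- AP:  arity-0 binding (a thunk)
      | MKPAP Int Int   -- PAP: binding with arguments (partial application)

    mkApOrPap :: Int -> Int -> Int -> BCInstrSketch
    mkApOrPap arity off size
      | arity == 0 = MKAP  off size
      | otherwise  = MKPAP off size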
@@ -89,7 +89,8 @@ data BCInstr
-- To do with the heap
| ALLOC_AP Int -- make an AP with this many payload words
| ALLOC_PAP Int Int -- make a PAP with this arity / payload words
- | MKAP Int{-ptr to AP/PAP is this far down stack-} Int{-# words-}
+ | MKAP Int{-ptr to AP is this far down stack-} Int{-# words-}
+ | MKPAP Int{-ptr to PAP is this far down stack-} Int{-# words-}
| UNPACK Int -- unpack N words from t.o.s Constr
| PACK DataCon Int
-- after assembly, the DataCon is an index into the
@@ -250,5 +251,6 @@ bciStackUse SWIZZLE{} = 0
-- so can't use this info. Not that it matters much.
bciStackUse SLIDE{} = 0
bciStackUse MKAP{} = 0
+bciStackUse MKPAP{} = 0
bciStackUse PACK{} = 1 -- worst case is PACK 0 words
\end{code}
@@ -16,7 +16,7 @@ import NameEnv
import SMRep ( typeCgRep )
import DataCon ( DataCon, dataConRepArgTys )
import TyCon ( TyCon, tyConFamilySize, isDataTyCon, tyConDataCons )
-import Constants ( mIN_SIZE_NonUpdHeapObject, wORD_SIZE )
+import Constants ( mIN_PAYLOAD_SIZE, wORD_SIZE )
import CgHeapery ( mkVirtHeapOffsets )
import FastString ( FastString(..) )
import Util ( lengthIs, listLengthCmp )
@@ -94,8 +94,8 @@ make_constr_itbls cons
ptrs = ptr_wds
nptrs = tot_wds - ptr_wds
nptrs_really
- | ptrs + nptrs >= mIN_SIZE_NonUpdHeapObject = nptrs
- | otherwise = mIN_SIZE_NonUpdHeapObject - ptrs
+ | ptrs + nptrs >= mIN_PAYLOAD_SIZE = nptrs
+ | otherwise = mIN_PAYLOAD_SIZE - ptrs
itbl = StgInfoTable {
ptrs = fromIntegral ptrs,
nptrs = fromIntegral nptrs_really,
@@ -40,8 +40,7 @@ mAX_SPEC_SELECTEE_SIZE = (MAX_SPEC_SELECTEE_SIZE :: Int)
mAX_SPEC_AP_SIZE = (MAX_SPEC_AP_SIZE :: Int)
-- closure sizes: these do NOT include the header (see below for header sizes)
-mIN_UPD_SIZE = (MIN_UPD_SIZE::Int)
-mIN_SIZE_NonUpdHeapObject = (MIN_NONUPD_SIZE::Int)
+mIN_PAYLOAD_SIZE = (MIN_PAYLOAD_SIZE::Int)
\end{code}
\begin{code}
@@ -32,7 +32,6 @@ module StaticFlags (
opt_MaxContextReductionDepth,
opt_IrrefutableTuples,
opt_Parallel,
- opt_SMP,
opt_RuntimeTypes,
opt_Flatten,
@@ -256,7 +255,6 @@ opt_DictsStrict = lookUp FSLIT("-fdicts-strict")
opt_IrrefutableTuples = lookUp FSLIT("-firrefutable-tuples")
opt_MaxContextReductionDepth = lookup_def_int "-fcontext-stack" mAX_CONTEXT_REDUCTION_DEPTH
opt_Parallel = lookUp FSLIT("-fparallel")
-opt_SMP = lookUp FSLIT("-fsmp")
opt_Flatten = lookUp FSLIT("-fflatten")
-- optimisation opts
@@ -315,7 +313,6 @@ isStaticFlag f =
"fdicts-strict",
"firrefutable-tuples",
"fparallel",
- "fsmp",
"fflatten",
"fsemi-tagging",
"flet-no-escape",
@@ -558,15 +555,15 @@ way_details =
, "-optc-DGRAN"
, "-package concurrent" ]),
- (WaySMP, Way "s" False "SMP"
- [ "-fsmp"
+ (WaySMP, Way "s" True "SMP"
+ [
#if !defined(mingw32_TARGET_OS)
- , "-optc-pthread"
+ "-optc-pthread"
#endif
#if !defined(mingw32_TARGET_OS) && !defined(freebsd_TARGET_OS)
, "-optl-pthread"
#endif
- , "-optc-DSMP" ]),
+ ]),
(WayNDP, Way "ndp" False "Nested data parallelism"
[ "-fparr"
@@ -52,28 +52,29 @@
#define bci_ALLOC_AP 27
#define bci_ALLOC_PAP 28
#define bci_MKAP 29
-#define bci_UNPACK 30
-#define bci_PACK 31
-#define bci_TESTLT_I 32
-#define bci_TESTEQ_I 33
-#define bci_TESTLT_F 34
-#define bci_TESTEQ_F 35
-#define bci_TESTLT_D 36
-#define bci_TESTEQ_D 37
-#define bci_TESTLT_P 38
-#define bci_TESTEQ_P 39
-#define bci_CASEFAIL 40
-#define bci_JMP 41
-#define bci_CCALL 42
-#define bci_SWIZZLE 43
-#define bci_ENTER 44
-#define bci_RETURN 45
-#define bci_RETURN_P 46
-#define bci_RETURN_N 47
-#define bci_RETURN_F 48
-#define bci_RETURN_D 49
-#define bci_RETURN_L 50
-#define bci_RETURN_V 51
+#define bci_MKPAP 30
+#define bci_UNPACK 31
+#define bci_PACK 32
+#define bci_TESTLT_I 33
+#define bci_TESTEQ_I 34
+#define bci_TESTLT_F 35
+#define bci_TESTEQ_F 36
+#define bci_TESTLT_D 37
+#define bci_TESTEQ_D 38
+#define bci_TESTLT_P 39
+#define bci_TESTEQ_P 40
+#define bci_CASEFAIL 41
+#define bci_JMP 42
+#define bci_CCALL 43
+#define bci_SWIZZLE 44
+#define bci_ENTER 45
+#define bci_RETURN 46
+#define bci_RETURN_P 47
+#define bci_RETURN_N 48
+#define bci_RETURN_F 49
+#define bci_RETURN_D 50
+#define bci_RETURN_L 51
+#define bci_RETURN_V 52
/* If a BCO definitely requires less than this many words of stack,
don't include an explicit STKCHECK insn in it. The interpreter
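
Inserting bci_MKPAP at 30 shifts every later opcode up by one, so the compiler-side assembler (the instr2/instr3 calls in ByteCodeAsm above) and this RTS header must be kept in lock step. The values around the insertion point, copied from the hunk above:

    -- Values copied from the renumbered header above.
    bci_MKAP, bci_MKPAP, bci_UNPACK :: Int
    bci_MKAP   = 29
    bci_MKPAP  = 30   -- new opcode
    bci_UNPACK = 31   -- was 30 before this commit; later opcodes shift by one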